| author | Kip Macy <kmacy@FreeBSD.org> | 2008-07-28 23:37:33 +0000 |
|---|---|---|
| committer | Kip Macy <kmacy@FreeBSD.org> | 2008-07-28 23:37:33 +0000 |
| commit | 6971fe8ddf2f0e170067a422e5f827724410bef9 (patch) | |
| tree | 8fd6cc6e7404202400d3d5f758a8f3b65766b0f4 | |
| parent | 3ccd11b631cb9868dc43b7d5c815100a17bd8d9e (diff) | |
59 files changed, 20553 insertions, 199 deletions
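Much of the PHY rework below (cxgb_ael1002.c and the new cxgb_tn1010.c) switches to table-driven register programming: a zero-terminated array of reg_val entries is walked by the new set_phy_regs() helper, which issues a plain MDIO write when clear_bits is 0xffff and a read-modify-write otherwise. The standalone C sketch below illustrates only that walk; the MDIO accessors are printf stubs standing in for mdio_write()/t3_mdio_change_bits(), and the register addresses and values are placeholders rather than the driver's real AEL2005 tables.

/*
 * Sketch of the zero-terminated register/value table walk used by the
 * new set_phy_regs() helper in cxgb_ael1002.c.  The MDIO accessors here
 * are stubs; in the driver they are mdio_write()/t3_mdio_change_bits().
 */
#include <stdio.h>

struct reg_val {
    unsigned short mmd_addr;    /* MDIO device (MMD) */
    unsigned short reg_addr;    /* register within the MMD */
    unsigned short clear_bits;  /* 0xffff means "overwrite whole register" */
    unsigned short set_bits;
};

static int stub_mdio_write(unsigned int mmd, unsigned int reg, unsigned int val)
{
    printf("write mmd %u reg 0x%04x <- 0x%04x\n", mmd, reg, val);
    return (0);
}

static int stub_mdio_change_bits(unsigned int mmd, unsigned int reg,
    unsigned int clear, unsigned int set)
{
    printf("rmw   mmd %u reg 0x%04x clear 0x%04x set 0x%04x\n",
        mmd, reg, clear, set);
    return (0);
}

static int set_regs(const struct reg_val *rv)
{
    int err = 0;

    /* Walk the table until the all-zero terminator or the first error. */
    for (; rv->mmd_addr && !err; rv++) {
        if (rv->clear_bits == 0xffff)
            err = stub_mdio_write(rv->mmd_addr, rv->reg_addr,
                rv->set_bits);
        else
            err = stub_mdio_change_bits(rv->mmd_addr, rv->reg_addr,
                rv->clear_bits, rv->set_bits);
    }
    return (err);
}

int main(void)
{
    /* Placeholder entries, not the driver's real initialization tables. */
    static const struct reg_val regs[] = {
        { 1, 0xc001, 0,      1 << 5 },  /* set one bit */
        { 1, 0xc013, 0xffff, 0xf341 },  /* full overwrite */
        { 0, 0, 0, 0 }                  /* terminator */
    };

    return (set_regs(regs));
}

In the driver this is the walk that ael2005_reset() applies to its regs0, regs1, and regs2 tables in the hunks that follow.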
diff --git a/sys/conf/files b/sys/conf/files index f71067411abd9..9672a92f10340 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -527,6 +527,7 @@ dev/cxgb/common/cxgb_ael1002.c optional cxgb pci dev/cxgb/common/cxgb_mv88e1xxx.c optional cxgb pci dev/cxgb/common/cxgb_xgmac.c optional cxgb pci dev/cxgb/common/cxgb_t3_hw.c optional cxgb pci +dev/cxgb/common/cxgb_tn1010.c optional cxgb pci dev/cxgb/sys/uipc_mvec.c optional cxgb pci dev/cxgb/sys/cxgb_support.c optional cxgb pci dev/cxgb/cxgb_t3fw.c optional cxgb cxgb_t3fw diff --git a/sys/dev/cxgb/common/cxgb_ael1002.c b/sys/dev/cxgb/common/cxgb_ael1002.c index b288d5d60535a..a9c7fb2d86770 100644 --- a/sys/dev/cxgb/common/cxgb_ael1002.c +++ b/sys/dev/cxgb/common/cxgb_ael1002.c @@ -46,11 +46,32 @@ enum { AEL1002_PWR_DOWN_LO = 0xc012, AEL1002_XFI_EQL = 0xc015, AEL1002_LB_EN = 0xc017, + AEL_OPT_SETTINGS = 0xc017, +}; - LASI_CTRL = 0x9002, - LASI_STAT = 0x9005 +struct reg_val { + unsigned short mmd_addr; + unsigned short reg_addr; + unsigned short clear_bits; + unsigned short set_bits; }; +static int set_phy_regs(struct cphy *phy, const struct reg_val *rv) +{ + int err; + + for (err = 0; rv->mmd_addr && !err; rv++) { + if (rv->clear_bits == 0xffff) + err = mdio_write(phy, rv->mmd_addr, rv->reg_addr, + rv->set_bits); + else + err = t3_mdio_change_bits(phy, rv->mmd_addr, + rv->reg_addr, rv->clear_bits, + rv->set_bits); + } + return err; +} + static void ael100x_txon(struct cphy *phy) { int tx_on_gpio = phy->addr == 0 ? F_GPIO7_OUT_VAL : F_GPIO2_OUT_VAL; @@ -158,33 +179,6 @@ static int ael1006_reset(struct cphy *phy, int wait) return t3_phy_reset(phy, MDIO_DEV_PMA_PMD, wait); } -static int ael1006_intr_enable(struct cphy *phy) -{ - return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 1); -} - -static int ael1006_intr_disable(struct cphy *phy) -{ - return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 0); -} - -static int ael1006_intr_clear(struct cphy *phy) -{ - u32 val; - - return mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &val); -} - -static int ael1006_intr_handler(struct cphy *phy) -{ - unsigned int status; - int err = mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &status); - - if (err) - return err; - return (status & 1) ? 
cphy_cause_link_change : 0; -} - static int ael1006_power_down(struct cphy *phy, int enable) { return t3_mdio_change_bits(phy, MDIO_DEV_PMA_PMD, MII_BMCR, @@ -194,10 +188,10 @@ static int ael1006_power_down(struct cphy *phy, int enable) #ifdef C99_NOT_SUPPORTED static struct cphy_ops ael1006_ops = { ael1006_reset, - ael1006_intr_enable, - ael1006_intr_disable, - ael1006_intr_clear, - ael1006_intr_handler, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, NULL, NULL, NULL, @@ -209,10 +203,10 @@ static struct cphy_ops ael1006_ops = { #else static struct cphy_ops ael1006_ops = { .reset = ael1006_reset, - .intr_enable = ael1006_intr_enable, - .intr_disable = ael1006_intr_disable, - .intr_clear = ael1006_intr_clear, - .intr_handler = ael1006_intr_handler, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, .get_link_status = ael100x_get_link_status, .power_down = ael1006_power_down, }; @@ -228,13 +222,382 @@ int t3_ael1006_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, return 0; } +static int ael2005_setup_sr_edc(struct cphy *phy) +{ + static u16 sr_edc[] = { + 0xcc00, 0x2ff4, + 0xcc01, 0x3cd4, + 0xcc02, 0x2015, + 0xcc03, 0x3105, + 0xcc04, 0x6524, + 0xcc05, 0x27ff, + 0xcc06, 0x300f, + 0xcc07, 0x2c8b, + 0xcc08, 0x300b, + 0xcc09, 0x4009, + 0xcc0a, 0x400e, + 0xcc0b, 0x2f72, + 0xcc0c, 0x3002, + 0xcc0d, 0x1002, + 0xcc0e, 0x2172, + 0xcc0f, 0x3012, + 0xcc10, 0x1002, + 0xcc11, 0x25d2, + 0xcc12, 0x3012, + 0xcc13, 0x1002, + 0xcc14, 0xd01e, + 0xcc15, 0x27d2, + 0xcc16, 0x3012, + 0xcc17, 0x1002, + 0xcc18, 0x2004, + 0xcc19, 0x3c84, + 0xcc1a, 0x6436, + 0xcc1b, 0x2007, + 0xcc1c, 0x3f87, + 0xcc1d, 0x8676, + 0xcc1e, 0x40b7, + 0xcc1f, 0xa746, + 0xcc20, 0x4047, + 0xcc21, 0x5673, + 0xcc22, 0x2982, + 0xcc23, 0x3002, + 0xcc24, 0x13d2, + 0xcc25, 0x8bbd, + 0xcc26, 0x2862, + 0xcc27, 0x3012, + 0xcc28, 0x1002, + 0xcc29, 0x2092, + 0xcc2a, 0x3012, + 0xcc2b, 0x1002, + 0xcc2c, 0x5cc3, + 0xcc2d, 0x314, + 0xcc2e, 0x2942, + 0xcc2f, 0x3002, + 0xcc30, 0x1002, + 0xcc31, 0xd019, + 0xcc32, 0x2032, + 0xcc33, 0x3012, + 0xcc34, 0x1002, + 0xcc35, 0x2a04, + 0xcc36, 0x3c74, + 0xcc37, 0x6435, + 0xcc38, 0x2fa4, + 0xcc39, 0x3cd4, + 0xcc3a, 0x6624, + 0xcc3b, 0x5563, + 0xcc3c, 0x2d42, + 0xcc3d, 0x3002, + 0xcc3e, 0x13d2, + 0xcc3f, 0x464d, + 0xcc40, 0x2862, + 0xcc41, 0x3012, + 0xcc42, 0x1002, + 0xcc43, 0x2032, + 0xcc44, 0x3012, + 0xcc45, 0x1002, + 0xcc46, 0x2fb4, + 0xcc47, 0x3cd4, + 0xcc48, 0x6624, + 0xcc49, 0x5563, + 0xcc4a, 0x2d42, + 0xcc4b, 0x3002, + 0xcc4c, 0x13d2, + 0xcc4d, 0x2ed2, + 0xcc4e, 0x3002, + 0xcc4f, 0x1002, + 0xcc50, 0x2fd2, + 0xcc51, 0x3002, + 0xcc52, 0x1002, + 0xcc53, 0x004, + 0xcc54, 0x2942, + 0xcc55, 0x3002, + 0xcc56, 0x1002, + 0xcc57, 0x2092, + 0xcc58, 0x3012, + 0xcc59, 0x1002, + 0xcc5a, 0x5cc3, + 0xcc5b, 0x317, + 0xcc5c, 0x2f72, + 0xcc5d, 0x3002, + 0xcc5e, 0x1002, + 0xcc5f, 0x2942, + 0xcc60, 0x3002, + 0xcc61, 0x1002, + 0xcc62, 0x22cd, + 0xcc63, 0x301d, + 0xcc64, 0x2862, + 0xcc65, 0x3012, + 0xcc66, 0x1002, + 0xcc67, 0x2ed2, + 0xcc68, 0x3002, + 0xcc69, 0x1002, + 0xcc6a, 0x2d72, + 0xcc6b, 0x3002, + 0xcc6c, 0x1002, + 0xcc6d, 0x628f, + 0xcc6e, 0x2112, + 0xcc6f, 0x3012, + 0xcc70, 0x1002, + 0xcc71, 0x5aa3, + 0xcc72, 0x2dc2, + 0xcc73, 0x3002, + 0xcc74, 0x1312, + 0xcc75, 0x6f72, + 0xcc76, 0x1002, + 0xcc77, 0x2807, + 0xcc78, 0x31a7, + 0xcc79, 0x20c4, + 0xcc7a, 0x3c24, + 0xcc7b, 0x6724, + 0xcc7c, 0x1002, + 0xcc7d, 0x2807, + 0xcc7e, 0x3187, + 0xcc7f, 
0x20c4, + 0xcc80, 0x3c24, + 0xcc81, 0x6724, + 0xcc82, 0x1002, + 0xcc83, 0x2514, + 0xcc84, 0x3c64, + 0xcc85, 0x6436, + 0xcc86, 0xdff4, + 0xcc87, 0x6436, + 0xcc88, 0x1002, + 0xcc89, 0x40a4, + 0xcc8a, 0x643c, + 0xcc8b, 0x4016, + 0xcc8c, 0x8c6c, + 0xcc8d, 0x2b24, + 0xcc8e, 0x3c24, + 0xcc8f, 0x6435, + 0xcc90, 0x1002, + 0xcc91, 0x2b24, + 0xcc92, 0x3c24, + 0xcc93, 0x643a, + 0xcc94, 0x4025, + 0xcc95, 0x8a5a, + 0xcc96, 0x1002, + 0xcc97, 0x2731, + 0xcc98, 0x3011, + 0xcc99, 0x1001, + 0xcc9a, 0xc7a0, + 0xcc9b, 0x100, + 0xcc9c, 0xc502, + 0xcc9d, 0x53ac, + 0xcc9e, 0xc503, + 0xcc9f, 0xd5d5, + 0xcca0, 0xc600, + 0xcca1, 0x2a6d, + 0xcca2, 0xc601, + 0xcca3, 0x2a4c, + 0xcca4, 0xc602, + 0xcca5, 0x111, + 0xcca6, 0xc60c, + 0xcca7, 0x5900, + 0xcca8, 0xc710, + 0xcca9, 0x700, + 0xccaa, 0xc718, + 0xccab, 0x700, + 0xccac, 0xc720, + 0xccad, 0x4700, + 0xccae, 0xc801, + 0xccaf, 0x7f50, + 0xccb0, 0xc802, + 0xccb1, 0x7760, + 0xccb2, 0xc803, + 0xccb3, 0x7fce, + 0xccb4, 0xc804, + 0xccb5, 0x5700, + 0xccb6, 0xc805, + 0xccb7, 0x5f11, + 0xccb8, 0xc806, + 0xccb9, 0x4751, + 0xccba, 0xc807, + 0xccbb, 0x57e1, + 0xccbc, 0xc808, + 0xccbd, 0x2700, + 0xccbe, 0xc809, + 0xccbf, 0x000, + 0xccc0, 0xc821, + 0xccc1, 0x002, + 0xccc2, 0xc822, + 0xccc3, 0x014, + 0xccc4, 0xc832, + 0xccc5, 0x1186, + 0xccc6, 0xc847, + 0xccc7, 0x1e02, + 0xccc8, 0xc013, + 0xccc9, 0xf341, + 0xccca, 0xc01a, + 0xcccb, 0x446, + 0xcccc, 0xc024, + 0xcccd, 0x1000, + 0xccce, 0xc025, + 0xcccf, 0xa00, + 0xccd0, 0xc026, + 0xccd1, 0xc0c, + 0xccd2, 0xc027, + 0xccd3, 0xc0c, + 0xccd4, 0xc029, + 0xccd5, 0x0a0, + 0xccd6, 0xc030, + 0xccd7, 0xa00, + 0xccd8, 0xc03c, + 0xccd9, 0x01c, + 0xccda, 0xc005, + 0xccdb, 0x7a06, + 0xccdc, 0x000, + 0xccdd, 0x2731, + 0xccde, 0x3011, + 0xccdf, 0x1001, + 0xcce0, 0xc620, + 0xcce1, 0x000, + 0xcce2, 0xc621, + 0xcce3, 0x03f, + 0xcce4, 0xc622, + 0xcce5, 0x000, + 0xcce6, 0xc623, + 0xcce7, 0x000, + 0xcce8, 0xc624, + 0xcce9, 0x000, + 0xccea, 0xc625, + 0xcceb, 0x000, + 0xccec, 0xc627, + 0xcced, 0x000, + 0xccee, 0xc628, + 0xccef, 0x000, + 0xccf0, 0xc62c, + 0xccf1, 0x000, + 0xccf2, 0x000, + 0xccf3, 0x2806, + 0xccf4, 0x3cb6, + 0xccf5, 0xc161, + 0xccf6, 0x6134, + 0xccf7, 0x6135, + 0xccf8, 0x5443, + 0xccf9, 0x303, + 0xccfa, 0x6524, + 0xccfb, 0x00b, + 0xccfc, 0x1002, + 0xccfd, 0x2104, + 0xccfe, 0x3c24, + 0xccff, 0x2105, + 0xcd00, 0x3805, + 0xcd01, 0x6524, + 0xcd02, 0xdff4, + 0xcd03, 0x4005, + 0xcd04, 0x6524, + 0xcd05, 0x1002, + 0xcd06, 0x5dd3, + 0xcd07, 0x306, + 0xcd08, 0x2ff7, + 0xcd09, 0x38f7, + 0xcd0a, 0x60b7, + 0xcd0b, 0xdffd, + 0xcd0c, 0x00a, + 0xcd0d, 0x1002, + 0xcd0e, 0 + }; + int i, err; + + for (err = i = 0; i < ARRAY_SIZE(sr_edc) && !err; i += 2) + err = mdio_write(phy, MDIO_DEV_PMA_PMD, sr_edc[i], + sr_edc[i + 1]); + return err; +} + +static int ael2005_reset(struct cphy *phy, int wait) +{ + static struct reg_val regs0[] = { + { MDIO_DEV_PMA_PMD, 0xc001, 0, 1 << 5 }, + { MDIO_DEV_PMA_PMD, 0xc017, 0, 1 << 5 }, + { MDIO_DEV_PMA_PMD, 0xc013, 0xffff, 0xf341 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0x8100 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + static struct reg_val regs1[] = { + { MDIO_DEV_PMA_PMD, 0xc003, 0xffff, 0x181 }, + { MDIO_DEV_PMA_PMD, 0xc010, 0xffff, 0x448a }, + { MDIO_DEV_PMA_PMD, 0xc04a, 0xffff, 0x5200 }, + { 0, 0, 0, 0 } + }; + static struct reg_val regs2[] = { + { MDIO_DEV_PMA_PMD, 0xca00, 0xffff, 0x0080 }, + { MDIO_DEV_PMA_PMD, 0xca12, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + + int err; + + err = t3_phy_reset(phy, 
MDIO_DEV_PMA_PMD, 0); + if (err) + return err; + + msleep(125); + err = set_phy_regs(phy, regs0); + if (err) + return err; + + msleep(50); + err = set_phy_regs(phy, regs1); + if (err) + return err; + + msleep(50); + err = ael2005_setup_sr_edc(phy); + if (err) + return err; + + return set_phy_regs(phy, regs2); +} + +#ifdef C99_NOT_SUPPORTED +static struct cphy_ops ael2005_ops = { + ael2005_reset, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, + NULL, + NULL, + NULL, + NULL, + NULL, + ael100x_get_link_status, + ael1002_power_down, +}; +#else +static struct cphy_ops ael2005_ops = { + .reset = ael2005_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .get_link_status = ael100x_get_link_status, + .power_down = ael1002_power_down, +}; +#endif + +int t3_ael2005_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael2005_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE, + "10GBASE-R"); + msleep(125); + return t3_mdio_change_bits(phy, MDIO_DEV_PMA_PMD, AEL_OPT_SETTINGS, 0, + 1 << 5); +} + #ifdef C99_NOT_SUPPORTED static struct cphy_ops qt2045_ops = { ael1006_reset, - ael1006_intr_enable, - ael1006_intr_disable, - ael1006_intr_clear, - ael1006_intr_handler, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, NULL, NULL, NULL, @@ -246,10 +609,10 @@ static struct cphy_ops qt2045_ops = { #else static struct cphy_ops qt2045_ops = { .reset = ael1006_reset, - .intr_enable = ael1006_intr_enable, - .intr_disable = ael1006_intr_disable, - .intr_clear = ael1006_intr_clear, - .intr_handler = ael1006_intr_handler, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, .get_link_status = ael100x_get_link_status, .power_down = ael1006_power_down, }; diff --git a/sys/dev/cxgb/common/cxgb_common.h b/sys/dev/cxgb/common/cxgb_common.h index 1ce6b4016b493..9ac28945533a2 100644 --- a/sys/dev/cxgb/common/cxgb_common.h +++ b/sys/dev/cxgb/common/cxgb_common.h @@ -47,10 +47,7 @@ enum { NCCTRL_WIN = 32, /* # of congestion control windows */ NTX_SCHED = 8, /* # of HW Tx scheduling queues */ PROTO_SRAM_LINES = 128, /* size of protocol sram */ - MAX_NPORTS = 4, - TP_TMR_RES = 200, - TP_SRAM_OFFSET = 4096, /* TP SRAM content offset in eeprom */ - TP_SRAM_LEN = 2112, /* TP SRAM content offset in eeprom */ + EXACT_ADDR_FILTERS = 8, /* # of HW exact match filters */ }; #define MAX_RX_COALESCING_LEN 12288U @@ -122,8 +119,8 @@ enum { }; struct sg_ent { /* SGE scatter/gather entry */ - u32 len[2]; - u64 addr[2]; + __be32 len[2]; + __be64 addr[2]; }; #ifndef SGE_NUM_GENBITS @@ -150,7 +147,7 @@ struct adapter_info { unsigned char mdien:1; unsigned char mdiinv:1; unsigned int gpio_out; /* GPIO output settings */ - unsigned int gpio_intr; /* GPIO IRQ enable mask */ + unsigned char gpio_intr[MAX_NPORTS]; /* GPIO PHY IRQ pins */ unsigned long caps; /* adapter capabilities */ const struct mdio_ops *mdio_ops; /* MDIO operations */ const char *desc; /* product description */ @@ -159,8 +156,6 @@ struct adapter_info { struct port_type_info { int (*phy_prep)(struct cphy *phy, adapter_t *adapter, int phy_addr, const struct mdio_ops *ops); - - }; struct mc5_stats { @@ -307,7 +302,7 
@@ struct tp_params { struct qset_params { /* SGE queue set parameters */ unsigned int polling; /* polling/interrupt service for rspq */ unsigned int lro; /* large receive offload */ - unsigned int coalesce_nsecs; /* irq coalescing timer */ + unsigned int coalesce_usecs; /* irq coalescing timer */ unsigned int rspq_size; /* # of entries in response queue */ unsigned int fl_size; /* # of entries in regular free list */ unsigned int jumbo_size; /* # of entries in jumbo free list */ @@ -486,12 +481,25 @@ enum { MAC_RXFIFO_SIZE = 32768 }; -/* IEEE 802.3ae specified MDIO devices */ +/* IEEE 802.3 specified MDIO devices */ enum { MDIO_DEV_PMA_PMD = 1, MDIO_DEV_WIS = 2, MDIO_DEV_PCS = 3, - MDIO_DEV_XGXS = 4 + MDIO_DEV_XGXS = 4, + MDIO_DEV_ANEG = 7, + MDIO_DEV_VEND1 = 30, + MDIO_DEV_VEND2 = 31 +}; + +/* LASI control and status registers */ +enum { + RX_ALARM_CTRL = 0x9000, + TX_ALARM_CTRL = 0x9001, + LASI_CTRL = 0x9002, + RX_ALARM_STAT = 0x9003, + TX_ALARM_STAT = 0x9004, + LASI_STAT = 0x9005 }; /* PHY loopback direction */ @@ -556,8 +564,8 @@ static inline int mdio_write(struct cphy *phy, int mmd, int reg, /* Convenience initializer */ static inline void cphy_init(struct cphy *phy, adapter_t *adapter, int phy_addr, struct cphy_ops *phy_ops, - const struct mdio_ops *mdio_ops, unsigned int caps, - const char *desc) + const struct mdio_ops *mdio_ops, unsigned int caps, + const char *desc) { phy->adapter = adapter; phy->addr = phy_addr; @@ -651,7 +659,12 @@ int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear, unsigned int set); int t3_phy_reset(struct cphy *phy, int mmd, int wait); int t3_phy_advertise(struct cphy *phy, unsigned int advert); +int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert); int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex); +int t3_phy_lasi_intr_enable(struct cphy *phy); +int t3_phy_lasi_intr_disable(struct cphy *phy); +int t3_phy_lasi_intr_clear(struct cphy *phy); +int t3_phy_lasi_intr_handler(struct cphy *phy); void t3_intr_enable(adapter_t *adapter); void t3_intr_disable(adapter_t *adapter); @@ -673,10 +686,10 @@ int t3_read_flash(adapter_t *adapter, unsigned int addr, unsigned int nwords, int t3_get_tp_version(adapter_t *adapter, u32 *vers); int t3_check_tpsram_version(adapter_t *adapter, int *must_load); int t3_check_tpsram(adapter_t *adapter, const u8 *tp_ram, unsigned int size); -int t3_load_fw(adapter_t *adapter, const const u8 *fw_data, unsigned int size); -int t3_load_boot(adapter_t *adapter, u8 *boot_data, unsigned int size); +int t3_load_fw(adapter_t *adapter, const u8 *fw_data, unsigned int size); int t3_get_fw_version(adapter_t *adapter, u32 *vers); int t3_check_fw_version(adapter_t *adapter, int *must_load); +int t3_load_boot(adapter_t *adapter, u8 *fw_data, unsigned int size); int t3_init_hw(adapter_t *adapter, u32 fw_params); void mac_prep(struct cmac *mac, adapter_t *adapter, int index); void early_hw_init(adapter_t *adapter, const struct adapter_info *ai); @@ -684,8 +697,8 @@ int t3_prep_adapter(adapter_t *adapter, const struct adapter_info *ai, int reset void t3_led_ready(adapter_t *adapter); void t3_fatal_err(adapter_t *adapter); void t3_set_vlan_accel(adapter_t *adapter, unsigned int ports, int on); -void t3_tp_set_offload_mode(adapter_t *adap, int enable); void t3_enable_filters(adapter_t *adap); +void t3_tp_set_offload_mode(adapter_t *adap, int enable); void t3_config_rss(adapter_t *adapter, unsigned int rss_config, const u8 *cpus, const u16 *rspq); int t3_read_rss(adapter_t *adapter, u8 *lkup, u16 
*map); @@ -719,7 +732,7 @@ void t3_mc5_intr_handler(struct mc5 *mc5); int t3_read_mc5_range(const struct mc5 *mc5, unsigned int start, unsigned int n, u32 *buf); -#if defined(CONFIG_CHELSIO_T3_CORE) +#ifdef CONFIG_CHELSIO_T3_CORE int t3_tp_set_coalescing_size(adapter_t *adap, unsigned int size, int psh); void t3_tp_set_max_rxsize(adapter_t *adap, unsigned int size); void t3_tp_get_mib_stats(adapter_t *adap, struct tp_mib_stats *tps); @@ -774,21 +787,22 @@ int t3_vsc7323_set_mtu(adapter_t *adap, unsigned int mtu, int port); int t3_vsc7323_set_addr(adapter_t *adap, u8 addr[6], int port); int t3_vsc7323_enable(adapter_t *adap, int port, int which); int t3_vsc7323_disable(adapter_t *adap, int port, int which); - -int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert); - const struct mac_stats *t3_vsc7323_update_stats(struct cmac *mac); int t3_mv88e1xxx_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); int t3_vsc8211_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); int t3_ael1002_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); int t3_ael1006_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); -int t3_qt2045_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael2005_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); +int t3_qt2045_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); +int t3_tn1010_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); int t3_xaui_direct_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); #endif /* __CHELSIO_COMMON_H */ diff --git a/sys/dev/cxgb/common/cxgb_mc5.c b/sys/dev/cxgb/common/cxgb_mc5.c index 0e40aca8880e0..6f1537c680843 100644 --- a/sys/dev/cxgb/common/cxgb_mc5.c +++ b/sys/dev/cxgb/common/cxgb_mc5.c @@ -326,9 +326,16 @@ static void mc5_dbgi_mode_disable(const struct mc5 *mc5) V_PRTYEN(mc5->parity_enabled) | F_MBUSEN); } -/* - * Initialization that requires the OS and protocol layers to already - * be intialized goes here. +/** + * t3_mc5_init - initialize MC5 and the TCAM + * @mc5: the MC5 handle + * @nservers: desired number the TCP servers (listening ports) + * @nfilters: desired number of HW filters (classifiers) + * @nroutes: desired number of routes + * + * Initialize MC5 and the TCAM and partition the TCAM for the requested + * number of servers, filters, and routes. The number of routes is + * typically 0 except for specialized uses of the T3 adapters. 
*/ int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, unsigned int nroutes) @@ -344,7 +351,7 @@ int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, if (nroutes > MAX_ROUTES || nroutes + nservers + nfilters > tcam_size) return -EINVAL; - if (nfilters && adap->params.rev < T3_REV_C) + if (nfilters) mc5->parity_enabled = 0; /* Reset the TCAM */ @@ -420,7 +427,7 @@ int t3_read_mc5_range(const struct mc5 *mc5, unsigned int start, } mc5_dbgi_mode_disable(mc5); - return 0; + return err; } #define MC5_INT_FATAL (F_PARITYERR | F_REQQPARERR | F_DISPQPARERR) @@ -465,7 +472,6 @@ void t3_mc5_intr_handler(struct mc5 *mc5) t3_write_reg(adap, A_MC5_DB_INT_CAUSE, cause); } - /** * t3_mc5_prep - initialize the SW state for MC5 * @adapter: the adapter diff --git a/sys/dev/cxgb/common/cxgb_mv88e1xxx.c b/sys/dev/cxgb/common/cxgb_mv88e1xxx.c index 8777b82b2f05f..ab8cce7fdc043 100644 --- a/sys/dev/cxgb/common/cxgb_mv88e1xxx.c +++ b/sys/dev/cxgb/common/cxgb_mv88e1xxx.c @@ -299,7 +299,7 @@ static struct cphy_ops mv88e1xxx_ops = { #endif int t3_mv88e1xxx_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops) + const struct mdio_ops *mdio_ops) { int err; @@ -310,9 +310,9 @@ int t3_mv88e1xxx_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, /* Configure copper PHY transmitter as class A to reduce EMI. */ err = mdio_write(phy, 0, MV88E1XXX_EXTENDED_ADDR, 0xb); - if (!err) err = mdio_write(phy, 0, MV88E1XXX_EXTENDED_DATA, 0x8004); + if (!err) err = mv88e1xxx_downshift_set(phy, 1); /* Enable downshift */ return err; diff --git a/sys/dev/cxgb/common/cxgb_t3_cpl.h b/sys/dev/cxgb/common/cxgb_t3_cpl.h index dd245712cd653..7cd219d222579 100644 --- a/sys/dev/cxgb/common/cxgb_t3_cpl.h +++ b/sys/dev/cxgb/common/cxgb_t3_cpl.h @@ -103,6 +103,7 @@ enum CPL_opcode { CPL_RDMA_TERMINATE = 0xA2, CPL_TRACE_PKT = 0xA3, CPL_RDMA_EC_STATUS = 0xA5, + CPL_SGE_EC_CR_RETURN = 0xA6, NUM_CPL_CMDS /* must be last and previous entries must be sorted */ }; @@ -148,7 +149,8 @@ enum { enum { CPL_PASS_OPEN_ACCEPT, - CPL_PASS_OPEN_REJECT + CPL_PASS_OPEN_REJECT, + CPL_PASS_OPEN_ACCEPT_TNL }; enum { @@ -907,6 +909,14 @@ struct cpl_wr_ack { __be32 snd_una; }; +struct cpl_sge_ec_cr_return { + RSS_HDR + union opcode_tid ot; + __be16 sge_ec_id; + __u8 cr; + __u8 rsvd; +}; + struct cpl_rdma_ec_status { RSS_HDR union opcode_tid ot; @@ -959,9 +969,11 @@ struct cpl_rx_data { __u8 dack_mode:2; __u8 psh:1; __u8 heartbeat:1; - __u8 :4; + __u8 ddp_off:1; + __u8 :3; #else - __u8 :4; + __u8 :3; + __u8 ddp_off:1; __u8 heartbeat:1; __u8 psh:1; __u8 dack_mode:2; @@ -1129,6 +1141,17 @@ struct cpl_tx_pkt { __be32 len; }; +struct cpl_tx_pkt_coalesce { + __be32 cntrl; + __be32 len; + __be64 addr; +}; + +struct tx_pkt_coalesce_wr { + WR_HDR; + struct cpl_tx_pkt_coalesce cpl[0]; +}; + struct cpl_tx_pkt_lso { WR_HDR; __be32 cntrl; @@ -1265,7 +1288,8 @@ struct cpl_l2t_write_req { WR_HDR; union opcode_tid ot; __be32 params; - __u8 rsvd[2]; + __u8 rsvd; + __u8 port_idx; __u8 dst_mac[6]; }; diff --git a/sys/dev/cxgb/common/cxgb_t3_hw.c b/sys/dev/cxgb/common/cxgb_t3_hw.c index 29fc328223d6e..acd41c034c571 100644 --- a/sys/dev/cxgb/common/cxgb_t3_hw.c +++ b/sys/dev/cxgb/common/cxgb_t3_hw.c @@ -460,32 +460,57 @@ int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex) return mdio_write(phy, 0, MII_BMCR, ctl); } +int t3_phy_lasi_intr_enable(struct cphy *phy) +{ + return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 1); +} + +int t3_phy_lasi_intr_disable(struct cphy 
*phy) +{ + return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 0); +} + +int t3_phy_lasi_intr_clear(struct cphy *phy) +{ + u32 val; + + return mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &val); +} + +int t3_phy_lasi_intr_handler(struct cphy *phy) +{ + unsigned int status; + int err = mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &status); + + if (err) + return err; + return (status & 1) ? cphy_cause_link_change : 0; +} + static struct adapter_info t3_adap_info[] = { { 1, 1, 0, 0, 0, F_GPIO2_OEN | F_GPIO4_OEN | - F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, F_GPIO3 | F_GPIO5, - 0, + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, &mi1_mdio_ops, "Chelsio PE9000" }, { 1, 1, 0, 0, 0, F_GPIO2_OEN | F_GPIO4_OEN | - F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, F_GPIO3 | F_GPIO5, - 0, + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, &mi1_mdio_ops, "Chelsio T302" }, { 1, 0, 0, 0, 0, F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO10_OEN | F_GPIO11_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, - 0, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + { 0 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, &mi1_mdio_ext_ops, "Chelsio T310" }, { 1, 1, 0, 0, 0, F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO5_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO10_OEN | F_GPIO11_OEN | F_GPIO1_OUT_VAL | - F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, 0, - SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { S_GPIO9, S_GPIO3 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, &mi1_mdio_ext_ops, "Chelsio T320" }, { 4, 0, 0, 0, 0, F_GPIO5_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO7_OUT_VAL, - F_GPIO1 | F_GPIO2 | F_GPIO3 | F_GPIO4, SUPPORTED_AUI, + { S_GPIO1, S_GPIO2, S_GPIO3, S_GPIO4 }, SUPPORTED_AUI, &mi1_mdio_ops, "Chelsio T304" }, }; @@ -504,10 +529,10 @@ static struct port_type_info port_types[] = { { t3_vsc8211_phy_prep }, { t3_mv88e1xxx_phy_prep }, { t3_xaui_direct_phy_prep }, - { NULL }, + { t3_ael2005_phy_prep }, { t3_qt2045_phy_prep }, { t3_ael1006_phy_prep }, - { NULL }, + { t3_tn1010_phy_prep }, }; #define VPD_ENTRY(name, len) \ @@ -1231,6 +1256,15 @@ void t3_link_changed(adapter_t *adapter, int port_id) phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc); + if (lc->requested_fc & PAUSE_AUTONEG) + fc &= lc->requested_fc; + else + fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + + if (link_ok == lc->link_ok && speed == lc->speed && + duplex == lc->duplex && fc == lc->fc) + return; /* nothing changed */ + if (link_ok != lc->link_ok && adapter->params.rev > 0 && uses_xaui(adapter)) { if (link_ok) @@ -1241,10 +1275,6 @@ void t3_link_changed(adapter_t *adapter, int port_id) lc->link_ok = (unsigned char)link_ok; lc->speed = speed < 0 ? SPEED_INVALID : speed; lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex; - if (lc->requested_fc & PAUSE_AUTONEG) - fc &= lc->requested_fc; - else - fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); if (link_ok && speed >= 0 && lc->autoneg == AUTONEG_ENABLE) { /* Set MAC speed, duplex, and flow control to match PHY. 
*/ @@ -1784,19 +1814,15 @@ static int mac_intr_handler(adapter_t *adap, unsigned int idx) */ int t3_phy_intr_handler(adapter_t *adapter) { - u32 mask, gpi = adapter_info(adapter)->gpio_intr; u32 i, cause = t3_read_reg(adapter, A_T3DBG_INT_CAUSE); for_each_port(adapter, i) { struct port_info *p = adap2pinfo(adapter, i); - mask = gpi - (gpi & (gpi - 1)); - gpi -= mask; - if (!(p->phy.caps & SUPPORTED_IRQ)) continue; - if (cause & mask) { + if (cause & (1 << adapter_info(adapter)->gpio_intr[i])) { int phy_cause = p->phy.ops->intr_handler(&p->phy); if (phy_cause & cphy_cause_link_change) @@ -1870,6 +1896,17 @@ int t3_slow_intr_handler(adapter_t *adapter) return 1; } +static unsigned int calc_gpio_intr(adapter_t *adap) +{ + unsigned int i, gpi_intr = 0; + + for_each_port(adap, i) + if ((adap2pinfo(adap, i)->phy.caps & SUPPORTED_IRQ) && + adapter_info(adap)->gpio_intr[i]) + gpi_intr |= 1 << adapter_info(adap)->gpio_intr[i]; + return gpi_intr; +} + /** * t3_intr_enable - enable interrupts * @adapter: the adapter whose interrupts should be enabled @@ -1912,10 +1949,8 @@ void t3_intr_enable(adapter_t *adapter) t3_write_reg(adapter, A_ULPTX_INT_ENABLE, ULPTX_INTR_MASK); } - t3_write_reg(adapter, A_T3DBG_GPIO_ACT_LOW, - adapter_info(adapter)->gpio_intr); - t3_write_reg(adapter, A_T3DBG_INT_ENABLE, - adapter_info(adapter)->gpio_intr); + t3_write_reg(adapter, A_T3DBG_INT_ENABLE, calc_gpio_intr(adapter)); + if (is_pcie(adapter)) t3_write_reg(adapter, A_PCIE_INT_ENABLE, PCIE_INTR_MASK); else @@ -2561,6 +2596,20 @@ static void tp_wr_bits_indirect(adapter_t *adap, unsigned int addr, } /** + * t3_enable_filters - enable the HW filters + * @adap: the adapter + * + * Enables the HW filters for NIC traffic. + */ +void t3_enable_filters(adapter_t *adap) +{ + t3_set_reg_field(adap, A_TP_IN_CONFIG, F_NICMODE, 0); + t3_set_reg_field(adap, A_MC5_DB_CONFIG, 0, F_FILTEREN); + t3_set_reg_field(adap, A_TP_GLOBAL_CONFIG, 0, V_FIVETUPLELOOKUP(3)); + tp_wr_bits_indirect(adap, A_TP_INGRESS_CONFIG, 0, F_LOOKUPEVERYPKT); +} + +/** * pm_num_pages - calculate the number of pages of the payload memory * @mem_size: the size of the payload memory * @pg_size: the size of each payload memory page @@ -2660,10 +2709,10 @@ static void tp_config(adapter_t *adap, const struct tp_params *p) F_TCPCHECKSUMOFFLOAD | V_IPTTL(64)); t3_write_reg(adap, A_TP_TCP_OPTIONS, V_MTUDEFAULT(576) | F_MTUENABLE | V_WINDOWSCALEMODE(1) | - V_TIMESTAMPSMODE(0) | V_SACKMODE(1) | V_SACKRX(1)); + V_TIMESTAMPSMODE(1) | V_SACKMODE(1) | V_SACKRX(1)); t3_write_reg(adap, A_TP_DACK_CONFIG, V_AUTOSTATE3(1) | V_AUTOSTATE2(1) | V_AUTOSTATE1(0) | - V_BYTETHRESHOLD(16384) | V_MSSTHRESHOLD(2) | + V_BYTETHRESHOLD(26880) | V_MSSTHRESHOLD(2) | F_AUTOCAREFUL | F_AUTOENABLE | V_DACK_MODE(1)); t3_set_reg_field(adap, A_TP_IN_CONFIG, F_RXFBARBPRIO | F_TXFBARBPRIO, F_IPV6ENABLE | F_NICMODE); @@ -2705,7 +2754,8 @@ static void tp_config(adapter_t *adap, const struct tp_params *p) if (adap->params.nports > 2) { t3_set_reg_field(adap, A_TP_PC_CONFIG2, 0, - F_ENABLETXPORTFROMDA | F_ENABLERXPORTFROMADDR); + F_ENABLETXPORTFROMDA2 | F_ENABLETXPORTFROMDA | + F_ENABLERXPORTFROMADDR); tp_wr_bits_indirect(adap, A_TP_QOS_RX_MAP_MODE, V_RXMAPMODE(M_RXMAPMODE), 0); tp_wr_indirect(adap, A_TP_INGRESS_CONFIG, V_BITPOS0(48) | @@ -3620,6 +3670,8 @@ int t3_init_hw(adapter_t *adapter, u32 fw_params) chan_init_hw(adapter, adapter->params.chan_map); t3_sge_init(adapter, &adapter->params.sge); + t3_write_reg(adapter, A_T3DBG_GPIO_ACT_LOW, calc_gpio_intr(adapter)); + t3_write_reg(adapter, 
A_CIM_HOST_ACC_DATA, vpd->uclk | fw_params); t3_write_reg(adapter, A_CIM_BOOT_CFG, V_BOOTADDR(FW_FLASH_BOOT_ADDR >> 2)); diff --git a/sys/dev/cxgb/common/cxgb_tn1010.c b/sys/dev/cxgb/common/cxgb_tn1010.c new file mode 100644 index 0000000000000..920ccc04a8665 --- /dev/null +++ b/sys/dev/cxgb/common/cxgb_tn1010.c @@ -0,0 +1,225 @@ +/************************************************************************** + +Copyright (c) 2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + +#undef msleep +#define msleep t3_os_sleep + +/* TN1010 PHY specific registers. */ +enum { + TN1010_VEND1_STAT = 1, +}; + +/* IEEE auto-negotiation 10GBASE-T registers */ +enum { + ANEG_ADVER = 16, + ANEG_LPA = 19, + ANEG_10G_CTRL = 32, + ANEG_10G_STAT = 33 +}; + +#define ADVERTISE_ENPAGE (1 << 12) +#define ADVERTISE_10000FULL (1 << 12) +#define ADVERTISE_LOOP_TIMING (1 << 0) + +/* vendor specific status register fields */ +#define F_XS_LANE_ALIGN_STAT (1 << 0) +#define F_PCS_BLK_LOCK (1 << 1) +#define F_PMD_SIGNAL_OK (1 << 2) +#define F_LINK_STAT (1 << 3) +#define F_ANEG_SPEED_1G (1 << 4) +#define F_ANEG_MASTER (1 << 5) + +#define S_ANEG_STAT 6 +#define M_ANEG_STAT 0x3 +#define G_ANEG_STAT(x) (((x) >> S_ANEG_STAT) & M_ANEG_STAT) + +enum { /* autonegotiation status */ + ANEG_IN_PROGR = 0, + ANEG_COMPLETE = 1, + ANEG_FAILED = 3 +}; + +/* + * Reset the PHY. May take up to 500ms to complete. + */ +static int tn1010_reset(struct cphy *phy, int wait) +{ + int err = t3_phy_reset(phy, MDIO_DEV_PMA_PMD, wait); + msleep(500); + return err; +} + +static int tn1010_power_down(struct cphy *phy, int enable) +{ + return t3_mdio_change_bits(phy, MDIO_DEV_PMA_PMD, MII_BMCR, + BMCR_PDOWN, enable ? 
BMCR_PDOWN : 0); +} + +static int tn1010_autoneg_enable(struct cphy *phy) +{ + int err; + + err = tn1010_power_down(phy, 0); + if (!err) + err = t3_mdio_change_bits(phy, MDIO_DEV_ANEG, MII_BMCR, 0, + BMCR_ANENABLE | BMCR_ANRESTART); + return err; +} + +static int tn1010_autoneg_restart(struct cphy *phy) +{ + int err; + + err = tn1010_power_down(phy, 0); + if (!err) + err = t3_mdio_change_bits(phy, MDIO_DEV_ANEG, MII_BMCR, 0, + BMCR_ANRESTART); + return err; +} + +static int tn1010_advertise(struct cphy *phy, unsigned int advert) +{ + int err, val; + + if (!(advert & ADVERTISED_1000baseT_Full)) + return -EINVAL; /* PHY can't disable 1000BASE-T */ + + val = ADVERTISE_CSMA | ADVERTISE_ENPAGE | ADVERTISE_NPAGE; + if (advert & ADVERTISED_Pause) + val |= ADVERTISE_PAUSE_CAP; + if (advert & ADVERTISED_Asym_Pause) + val |= ADVERTISE_PAUSE_ASYM; + err = mdio_write(phy, MDIO_DEV_ANEG, ANEG_ADVER, val); + if (err) + return err; + + val = (advert & ADVERTISED_10000baseT_Full) ? ADVERTISE_10000FULL : 0; + return mdio_write(phy, MDIO_DEV_ANEG, ANEG_10G_CTRL, val | + ADVERTISE_LOOP_TIMING); +} + +static int tn1010_get_link_status(struct cphy *phy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + unsigned int status, lpa, adv; + int err, sp = -1, pause = 0; + + err = mdio_read(phy, MDIO_DEV_VEND1, TN1010_VEND1_STAT, &status); + if (err) + return err; + + if (link_ok) + *link_ok = (status & F_LINK_STAT) != 0; + + if (G_ANEG_STAT(status) == ANEG_COMPLETE) { + sp = (status & F_ANEG_SPEED_1G) ? SPEED_1000 : SPEED_10000; + + if (fc) { + err = mdio_read(phy, MDIO_DEV_ANEG, ANEG_LPA, &lpa); + if (!err) + err = mdio_read(phy, MDIO_DEV_ANEG, ANEG_ADVER, + &adv); + if (err) + return err; + + if (lpa & adv & ADVERTISE_PAUSE_CAP) + pause = PAUSE_RX | PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_CAP) && + (lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_ASYM)) + pause = PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_CAP)) + pause = PAUSE_RX; + } + } + if (speed) + *speed = sp; + if (duplex) + *duplex = DUPLEX_FULL; + if (fc) + *fc = pause; + return 0; +} + +static int tn1010_set_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + return -EINVAL; /* require autoneg */ +} + +#ifdef C99_NOT_SUPPORTED +static struct cphy_ops tn1010_ops = { + tn1010_reset, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, + tn1010_autoneg_enable, + tn1010_autoneg_restart, + tn1010_advertise, + NULL, + tn1010_set_speed_duplex, + tn1010_get_link_status, + tn1010_power_down, +}; +#else +static struct cphy_ops tn1010_ops = { + .reset = tn1010_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .autoneg_enable = tn1010_autoneg_enable, + .autoneg_restart = tn1010_autoneg_restart, + .advertise = tn1010_advertise, + .set_speed_duplex = tn1010_set_speed_duplex, + .get_link_status = tn1010_get_link_status, + .power_down = tn1010_power_down, +}; +#endif + +int t3_tn1010_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &tn1010_ops, mdio_ops, + SUPPORTED_1000baseT_Full | SUPPORTED_10000baseT_Full | + SUPPORTED_Autoneg | SUPPORTED_AUI | SUPPORTED_TP, + "1000/10GBASE-T"); + msleep(500); /* PHY needs up to 500ms to start responding to MDIO */ + return 0; +} diff --git a/sys/dev/cxgb/common/cxgb_vsc8211.c 
b/sys/dev/cxgb/common/cxgb_vsc8211.c index 61bdc9c7f5ed2..ad3c88e4c99d3 100644 --- a/sys/dev/cxgb/common/cxgb_vsc8211.c +++ b/sys/dev/cxgb/common/cxgb_vsc8211.c @@ -45,6 +45,7 @@ enum { VSC8211_EXT_CTRL = 23, VSC8211_INTR_ENABLE = 25, VSC8211_INTR_STATUS = 26, + VSC8211_LED_CTRL = 27, VSC8211_AUX_CTRL_STAT = 28, VSC8211_EXT_PAGE_AXS = 31, }; @@ -393,8 +394,10 @@ int t3_vsc8211_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, err = mdio_read(phy, 0, VSC8211_EXT_CTRL, &val); if (err) return err; - if (val & VSC_CTRL_MEDIA_MODE_HI) - return 0; /* copper interface, done */ + if (val & VSC_CTRL_MEDIA_MODE_HI) { + /* copper interface, just need to configure the LEDs */ + return mdio_write(phy, 0, VSC8211_LED_CTRL, 0x100); + } phy->caps = SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | SUPPORTED_MII | SUPPORTED_FIBRE | SUPPORTED_IRQ; diff --git a/sys/dev/cxgb/common/cxgb_xgmac.c b/sys/dev/cxgb/common/cxgb_xgmac.c index 745cc4b4dd5e2..51a02c25bcb69 100644 --- a/sys/dev/cxgb/common/cxgb_xgmac.c +++ b/sys/dev/cxgb/common/cxgb_xgmac.c @@ -44,7 +44,6 @@ __FBSDID("$FreeBSD$"); * # of exact address filters. The first one is used for the station address, * the rest are available for multicast addresses. */ -#define EXACT_ADDR_FILTERS 8 static inline int macidx(const struct cmac *mac) { @@ -159,16 +158,18 @@ int t3_mac_reset(struct cmac *mac) t3_write_reg(adap, A_XGM_TX_CTRL + oft, F_TXEN); t3_write_reg(adap, A_XGM_RX_CTRL + oft, F_RXEN); } + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + oft, V_RXMAXFRAMERSIZE(M_RXMAXFRAMERSIZE), V_RXMAXFRAMERSIZE(MAX_FRAME_SIZE) | F_RXENFRAMER); + val = F_MAC_RESET_ | F_XGMAC_STOP_EN; - if (is_10G(adap) || mac->multiport) + if (!mac->multiport) + val |= F_XG2G_RESET_; + if (uses_xaui(adap)) val |= F_PCS_RESET_; - else if (uses_xaui(adap)) - val |= F_PCS_RESET_ | F_XG2G_RESET_; else - val |= F_RGMII_RESET_ | F_XG2G_RESET_; + val |= F_RGMII_RESET_; t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val); (void) t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ if ((val & F_PCS_RESET_) && adap->params.rev) { @@ -188,10 +189,10 @@ static int t3b2_mac_reset(struct cmac *mac) /* Stop egress traffic to xgm*/ - if (!macidx(mac)) - t3_set_reg_field(adap, A_MPS_CFG, F_PORT0ACTIVE, 0); + if (!macidx(mac)) + t3_set_reg_field(adap, A_MPS_CFG, F_PORT0ACTIVE, 0); else - t3_set_reg_field(adap, A_MPS_CFG, F_PORT1ACTIVE, 0); + t3_set_reg_field(adap, A_MPS_CFG, F_PORT1ACTIVE, 0); /* PCS in reset */ t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_); @@ -223,15 +224,15 @@ static int t3b2_mac_reset(struct cmac *mac) msleep(1); t3b_pcs_reset(mac); } - t3_write_reg(adap, A_XGM_RX_CFG + oft, + t3_write_reg(adap, A_XGM_RX_CFG + oft, F_DISPAUSEFRAMES | F_EN1536BFRAMES | F_RMFCS | F_ENJUMBO | F_ENHASHMCAST ); /*Resume egress traffic to xgm*/ - if (!macidx(mac)) - t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT0ACTIVE); + if (!macidx(mac)) + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT0ACTIVE); else - t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT1ACTIVE); + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT1ACTIVE); return 0; } @@ -279,6 +280,9 @@ int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6]) * Specify the number of exact address filters that should be reserved for * unicast addresses. Caller should reload the unicast and multicast * addresses after calling this. + * + * Generally, this is 1 with the first one used for the station address, + * and the rest are available for multicast addresses. 
*/ int t3_mac_set_num_ucast(struct cmac *mac, unsigned char n) { @@ -385,7 +389,7 @@ static int rx_fifo_hwm(int mtu) * * Sets the MAC MTU and adjusts the FIFO PAUSE watermarks accordingly. */ -int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) +int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) { int hwm, lwm, divisor; int ipg; @@ -413,7 +417,7 @@ int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) reg = adap->params.rev == T3_REV_B2 ? A_XGM_RX_MAX_PKT_SIZE_ERR_CNT : A_XGM_RXFIFO_CFG; - + /* drain RX FIFO */ if (t3_wait_op_done(adap, reg + mac->offset, F_RXFIFO_EMPTY, 1, 20, 5)) { @@ -428,9 +432,8 @@ int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) enable_exact_filters(mac); } else t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, - V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), - V_RXMAXPKTSIZE(mtu)); - + V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), + V_RXMAXPKTSIZE(mtu)); /* * Adjust the PAUSE frame watermarks. We always set the LWM, and the * HWM only if flow-control is enabled. @@ -462,10 +465,10 @@ int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) */ if (adap->params.rev > 0) { divisor = (adap->params.rev == T3_REV_C) ? 64 : 8; - t3_write_reg(adap, A_XGM_PAUSE_TIMER + mac->offset, - (hwm - lwm) * 4 / divisor); + t3_write_reg(adap, A_XGM_PAUSE_TIMER + mac->offset, + (hwm - lwm) * 4 / divisor); } - t3_write_reg(adap, A_XGM_TX_PAUSE_QUANTA + mac->offset, + t3_write_reg(adap, A_XGM_TX_PAUSE_QUANTA + mac->offset, MAC_RXFIFO_SIZE * 4 * 8 / 512); return 0; } @@ -489,7 +492,7 @@ int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc) if (duplex >= 0 && duplex != DUPLEX_FULL) return -EINVAL; - if (mac->multiport) { + if (mac->multiport) { val = t3_read_reg(adap, A_XGM_RXFIFO_CFG + oft); val &= ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM); val |= V_RXFIFOPAUSEHWM(rx_fifo_hwm(t3_read_reg(adap, @@ -575,7 +578,7 @@ int t3_mac_enable(struct cmac *mac, int which) mac->txen = F_TXEN; mac->toggle_cnt = 0; } - if (which & MAC_DIRECTION_RX) + if (which & MAC_DIRECTION_RX) t3_write_reg(adap, A_XGM_RX_CTRL + oft, F_RXEN); return 0; } @@ -673,10 +676,10 @@ rxcheck: if (rx_mcnt != mac->rx_mcnt) { rx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, A_XGM_RX_SPI4_SOP_EOP_CNT + - mac->offset))) + + mac->offset))) + (s->rx_fifo_ovfl - mac->rx_ocnt); mac->rx_ocnt = s->rx_fifo_ovfl; - } else + } else goto out; if (mac->rx_mcnt != s->rx_frames && rx_xcnt == 0 && mac->rx_xcnt == 0) { @@ -684,8 +687,8 @@ rxcheck: status = 2; goto out; } - -out: + +out: mac->tx_tcnt = tx_tcnt; mac->tx_xcnt = tx_xcnt; mac->tx_mcnt = s->tx_frames; diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h index f2b0531503910..39fe8eb91f58a 100644 --- a/sys/dev/cxgb/cxgb_adapter.h +++ b/sys/dev/cxgb/cxgb_adapter.h @@ -166,7 +166,7 @@ enum { TXQ_ETH = 0, * work request size in bytes */ #define WR_LEN (WR_FLITS * 8) -#define PIO_LEN (WR_LEN - sizeof(struct cpl_tx_pkt)) +#define PIO_LEN (WR_LEN - sizeof(struct cpl_tx_pkt_lso)) /* careful, the following are set on priv_flags and must not collide with diff --git a/sys/dev/cxgb/cxgb_config.h b/sys/dev/cxgb/cxgb_config.h index 723c23e7279f8..a5ee963b4734f 100644 --- a/sys/dev/cxgb/cxgb_config.h +++ b/sys/dev/cxgb/cxgb_config.h @@ -31,7 +31,6 @@ $FreeBSD$ ***************************************************************************/ #ifndef _CXGB_CONFIG_H_ #define _CXGB_CONFIG_H_ -#define DISABLE_MBUF_IOVEC #define RTALLOC2_DEFINED #define VM_FAULT_HOLD_DEFINED #ifndef CONFIG_DEFINED diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c index 
f6cfcdfbe46e6..4fb53b53efe70 100644 --- a/sys/dev/cxgb/cxgb_main.c +++ b/sys/dev/cxgb/cxgb_main.c @@ -9,7 +9,7 @@ modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Neither the name of the Chelsio Corporation nor the names of its + 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include <net/if_dl.h> #include <net/if_media.h> #include <net/if_types.h> +#include <net/if_vlan_var.h> #include <netinet/in_systm.h> #include <netinet/in.h> @@ -724,10 +725,9 @@ cxgb_free(struct adapter *sc) } else printf("not offloading set\n"); #ifdef notyet - /* XXX need to handle unload in TOM */ if (sc->flags & CXGB_OFLD_INIT) cxgb_offload_deactivate(sc); -#endif +#endif free(sc->filters, M_DEVBUF); t3_sge_free(sc); @@ -979,7 +979,7 @@ cxgb_port_attach(device_t dev) * Only default to jumbo frames on 10GigE */ if (p->adapter->params.nports <= 2) - ifp->if_mtu = 9000; + ifp->if_mtu = ETHERMTU_JUMBO; if ((err = cxgb_makedev(p)) != 0) { printf("makedev failed %d\n", err); return (err); @@ -1255,13 +1255,23 @@ cxgb_link_start(struct port_info *p) struct ifnet *ifp; struct t3_rx_mode rm; struct cmac *mac = &p->mac; + int mtu, hwtagging; ifp = p->ifp; + bcopy(IF_LLADDR(ifp), p->hw_addr, ETHER_ADDR_LEN); + + mtu = ifp->if_mtu; + if (ifp->if_capenable & IFCAP_VLAN_MTU) + mtu += ETHER_VLAN_ENCAP_LEN; + + hwtagging = (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0; + t3_init_rx_mode(&rm, p); if (!mac->multiport) t3_mac_reset(mac); - t3_mac_set_mtu(mac, ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + t3_mac_set_mtu(mac, mtu); + t3_set_vlan_accel(p->adapter, 1 << p->tx_chan, hwtagging); t3_mac_set_address(mac, 0, p->hw_addr); t3_mac_set_rx_mode(mac, &rm); t3_link_start(&p->phy, mac, &p->link_config); @@ -1751,10 +1761,9 @@ offload_open(struct port_info *pi) adapter->params.rev == 0 ? 
adapter->port[0].ifp->if_mtu : 0xffff); init_smt(adapter); -#ifdef TOE_ENABLED /* Call back all registered clients */ cxgb_add_clients(tdev); -#endif + /* restore them in case the offload module has changed them */ if (err) { t3_tp_set_offload_mode(adapter, 0); @@ -1771,10 +1780,10 @@ offload_close(struct t3cdev *tdev) if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT)) return (0); -#ifdef TOE_ENABLED + /* Call back all registered clients */ cxgb_remove_clients(tdev); -#endif + tdev->lldev = NULL; cxgb_set_dummy_ops(tdev); t3_tp_set_offload_mode(adapter, 0); @@ -1904,7 +1913,7 @@ cxgb_set_mtu(struct port_info *p, int mtu) struct ifnet *ifp = p->ifp; int error = 0; - if ((mtu < ETHERMIN) || (mtu > ETHER_MAX_LEN_JUMBO)) + if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) error = EINVAL; else if (ifp->if_mtu != mtu) { PORT_LOCK(p); @@ -1924,7 +1933,7 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) struct port_info *p = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *)data; struct ifreq *ifr = (struct ifreq *)data; - int flags, error = 0; + int flags, error = 0, reinit = 0; uint32_t mask; /* @@ -1979,18 +1988,16 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP - | CSUM_TSO); + | CSUM_IP | CSUM_TSO); } else { ifp->if_capenable |= IFCAP_TXCSUM; - ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); - } - } else if (mask & IFCAP_RXCSUM) { - if (IFCAP_RXCSUM & ifp->if_capenable) { - ifp->if_capenable &= ~IFCAP_RXCSUM; - } else { - ifp->if_capenable |= IFCAP_RXCSUM; + ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP + | CSUM_IP); } } + if (mask & IFCAP_RXCSUM) { + ifp->if_capenable ^= IFCAP_RXCSUM; + } if (mask & IFCAP_TSO4) { if (IFCAP_TSO4 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO4; @@ -2005,7 +2012,26 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) error = EINVAL; } } + if (mask & IFCAP_VLAN_HWTAGGING) { + ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; + reinit = ifp->if_drv_flags & IFF_DRV_RUNNING; + } + if (mask & IFCAP_VLAN_MTU) { + ifp->if_capenable ^= IFCAP_VLAN_MTU; + reinit = ifp->if_drv_flags & IFF_DRV_RUNNING; + } + if (mask & IFCAP_VLAN_HWCSUM) { + ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; + } + if (reinit) { + cxgb_stop_locked(p); + cxgb_init_locked(p); + } PORT_UNLOCK(p); + +#ifdef VLAN_CAPABILITIES + VLAN_CAPABILITIES(ifp); +#endif break; default: error = ether_ioctl(ifp, command, data); @@ -2126,9 +2152,11 @@ check_t3b2_mac(struct adapter *adapter) p->mac.stats.num_toggled++; else if (status == 2) { struct cmac *mac = &p->mac; + int mtu = ifp->if_mtu; - t3_mac_set_mtu(mac, ifp->if_mtu + ETHER_HDR_LEN - + ETHER_VLAN_ENCAP_LEN); + if (ifp->if_capenable & IFCAP_VLAN_MTU) + mtu += ETHER_VLAN_ENCAP_LEN; + t3_mac_set_mtu(mac, mtu); t3_mac_set_address(mac, 0, p->hw_addr); cxgb_set_rxmode(p); t3_link_start(&p->phy, mac, &p->link_config); @@ -2434,7 +2462,7 @@ cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, if (t->intr_lat >= 0) { struct sge_qset *qs = &sc->sge.qs[t->qset_idx]; - q->coalesce_nsecs = t->intr_lat*1000; + q->coalesce_usecs = t->intr_lat; t3_update_qset_coalesce(qs, q); } break; @@ -2454,7 +2482,7 @@ cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, t->fl_size[0] = q->fl_size; t->fl_size[1] = q->jumbo_size; t->polling = q->polling; - t->intr_lat = q->coalesce_nsecs / 1000; + t->intr_lat = q->coalesce_usecs; t->cong_thres = q->cong_thres; 
break; } diff --git a/sys/dev/cxgb/cxgb_offload.c b/sys/dev/cxgb/cxgb_offload.c index 1eeafafa5b4f5..d865e7f7cbfb1 100644 --- a/sys/dev/cxgb/cxgb_offload.c +++ b/sys/dev/cxgb/cxgb_offload.c @@ -1,7 +1,6 @@ - /************************************************************************** -Copyright (c) 2007, Chelsio Inc. +Copyright (c) 2007-2008, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -104,7 +103,7 @@ unregister_tdev(struct t3cdev *tdev) mtx_unlock(&cxgb_db_lock); } -#ifdef TOE_ENABLED +#ifndef TCP_OFFLOAD_DISABLE /** * cxgb_register_client - register an offload client * @client: the client diff --git a/sys/dev/cxgb/cxgb_offload.h b/sys/dev/cxgb/cxgb_offload.h index dbe2bc50a4cd8..605dd0b0dc2a8 100644 --- a/sys/dev/cxgb/cxgb_offload.h +++ b/sys/dev/cxgb/cxgb_offload.h @@ -36,17 +36,13 @@ $FreeBSD$ #ifdef CONFIG_DEFINED #include <common/cxgb_version.h> #include <cxgb_config.h> -#ifdef TOE_ENABLED #include <ulp/tom/cxgb_l2t.h> -#endif #include <common/cxgb_tcb.h> #include <t3cdev.h> #else #include <dev/cxgb/common/cxgb_version.h> #include <dev/cxgb/cxgb_config.h> -#ifdef TOE_ENABLED #include <dev/cxgb/ulp/tom/cxgb_l2t.h> -#endif #include <dev/cxgb/common/cxgb_tcb.h> #include <dev/cxgb/t3cdev.h> #endif @@ -83,7 +79,6 @@ void cxgb_remove_clients(struct t3cdev *tdev); typedef int (*cxgb_cpl_handler_func)(struct t3cdev *dev, struct mbuf *m, void *ctx); -#ifdef TOE_ENABLED struct cxgb_client { char *name; void (*add) (struct t3cdev *); @@ -102,7 +97,6 @@ int cxgb_alloc_atid(struct t3cdev *dev, struct cxgb_client *client, void *ctx); int cxgb_alloc_stid(struct t3cdev *dev, struct cxgb_client *client, void *ctx); -#endif void *cxgb_free_atid(struct t3cdev *dev, int atid); void cxgb_free_stid(struct t3cdev *dev, int stid); void *cxgb_get_lctx(struct t3cdev *tdev, int stid); diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h index 7466d8a24be5e..73d7c77ae3cee 100644 --- a/sys/dev/cxgb/cxgb_osdep.h +++ b/sys/dev/cxgb/cxgb_osdep.h @@ -55,12 +55,25 @@ $FreeBSD$ typedef struct adapter adapter_t; struct sge_rspq; +enum { + TP_TMR_RES = 200, /* TP timer resolution in usec */ + MAX_NPORTS = 4, /* max # of ports */ + TP_SRAM_OFFSET = 4096, /* TP SRAM content offset in eeprom */ + TP_SRAM_LEN = 2112, /* TP SRAM content offset in eeprom */ +}; struct t3_mbuf_hdr { struct mbuf *mh_head; struct mbuf *mh_tail; }; +#ifndef PANIC_IF +#define PANIC_IF(exp) do { \ + if (exp) \ + panic("BUG: %s", #exp); \ +} while (0) +#endif + #define m_get_priority(m) ((uintptr_t)(m)->m_pkthdr.rcvif) #define m_set_priority(m, pri) ((m)->m_pkthdr.rcvif = (struct ifnet *)((uintptr_t)pri)) #define m_set_sgl(m, sgl) ((m)->m_pkthdr.header = (sgl)) @@ -127,9 +140,6 @@ void cxgb_log_tcb(struct adapter *sc, unsigned int tid); #define TX_START_MIN_DESC (TX_MAX_DESC << 2) - - - #define TX_START_MAX_DESC (TX_MAX_DESC << 3) /* maximum number of descriptors * call to start used per */ @@ -159,7 +169,7 @@ void prefetch(void *x) extern void kdb_backtrace(void); #define WARN_ON(condition) do { \ - if ((condition)!=0) { \ + if (__predict_false((condition)!=0)) { \ log(LOG_WARNING, "BUG: warning at %s:%d/%s()\n", __FILE__, __LINE__, __FUNCTION__); \ kdb_backtrace(); \ } \ @@ -384,6 +394,9 @@ static const int debug_flags = DBG_RX; #define ADVERTISE_1000XPSE_ASYM ANAR_X_PAUSE_ASYM #define ADVERTISE_1000XPAUSE ANAR_X_PAUSE_SYM +#define ADVERTISE_CSMA ANAR_CSMA +#define ADVERTISE_NPAGE ANAR_NP + /* Standard PCI Extended Capaibilities definitions */ #define PCI_CAP_ID_VPD 0x03 
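The cxgb_sge.c and cxgb_main.c hunks that follow change the interrupt-coalescing parameter from nanoseconds (coalesce_nsecs) to microseconds (coalesce_usecs). The holdoff timer programmed into the SGE response queue keeps the granularity implied by the old nsecs / 100 conversion, so t3_update_qset_coalesce() now computes coalesce_usecs * 10 with a minimum of 1. The sketch below restates that arithmetic; the helper name and the standalone main() are made up for illustration.

#include <stdio.h>

/*
 * 1 us == 10 ticks at the granularity implied by the old
 * coalesce_nsecs / 100 conversion; clamp to a minimum of 1, matching
 * max(p->coalesce_usecs * 10, 1U) in t3_update_qset_coalesce().
 * The function name is hypothetical.
 */
static unsigned int holdoff_tmr_from_usecs(unsigned int coalesce_usecs)
{
    unsigned int tmr = coalesce_usecs * 10;

    return (tmr > 0 ? tmr : 1);
}

int main(void)
{
    /* Defaults from t3_sge_prep() below: 50 us (>2 ports), 5 us otherwise. */
    printf("50 us -> %u ticks\n", holdoff_tmr_from_usecs(50));
    printf(" 5 us -> %u ticks\n", holdoff_tmr_from_usecs(5));
    printf(" 0 us -> %u tick\n",  holdoff_tmr_from_usecs(0));
    return (0);
}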
diff --git a/sys/dev/cxgb/cxgb_sge.c b/sys/dev/cxgb/cxgb_sge.c index 7f9c933854d05..50335aa17bb0f 100644 --- a/sys/dev/cxgb/cxgb_sge.c +++ b/sys/dev/cxgb/cxgb_sge.c @@ -394,12 +394,12 @@ t3_sge_prep(adapter_t *adap, struct sge_params *p) struct qset_params *q = p->qset + i; if (adap->params.nports > 2) { - q->coalesce_nsecs = 50000; + q->coalesce_usecs = 50; } else { #ifdef INVARIANTS - q->coalesce_nsecs = 10000; + q->coalesce_usecs = 10; #else - q->coalesce_nsecs = 5000; + q->coalesce_usecs = 5; #endif } q->polling = adap->params.rev > 0; @@ -490,7 +490,7 @@ void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) { - qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U); + qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U); qs->rspq.polling = 0 /* p->polling */; } @@ -1314,6 +1314,10 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) cntrl = V_TXPKT_INTF(pi->txpkt_intf); GET_VTAG_MI(cntrl, batchmi); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); + if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) + cntrl |= F_TXPKT_IPCSUM_DIS; + if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)))) + cntrl |= F_TXPKT_L4CSUM_DIS; cbe->cntrl = htonl(cntrl); cbe->len = htonl(batchmi->mi_len | 0x80000000); cbe->addr = htobe64(segs[i].ds_addr); @@ -1343,7 +1347,7 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) tmpmi = mv->mv_vec; txd->flit[2] = 0; - GET_VTAG_MI(cntrl, mi); + GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); hdr->cntrl = htonl(cntrl); mlen = m0->m_pkthdr.len; @@ -1356,7 +1360,10 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) if (__predict_false(undersized)) { pkthdr = tmp; - dump_mi(mi); + if (mi) + dump_mi(mi); + printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x", + m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags); panic("discontig packet - fixxorz"); } else pkthdr = m0->m_data; @@ -1376,12 +1383,39 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) V_LSO_IPHDR_WORDS(ip->ip_hl) | V_LSO_TCPHDR_WORDS(tcp->th_off); hdr->lso_info = htonl(tso_info); + + if (__predict_false(mlen <= PIO_LEN)) { + /* pkt not undersized but fits in PIO_LEN + * Indicates a TSO bug at the higher levels. 
+ */ + DPRINTF("**5592 Fix** mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x", + m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags); + txq_prod(txq, 1, &txqs); + m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]); + m_freem(m0); + m0 = NULL; + flits = (mlen + 7) / 8 + 3; + hdr->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | + V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | + F_WR_SOP | F_WR_EOP | txqs.compl); + wmb(); + hdr->wr.wr_lo = htonl(V_WR_LEN(flits) | + V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); + + wr_gen2(txd, txqs.gen); + check_ring_tx_db(sc, txq); + return (0); + } flits = 3; } else { struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); + if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) + cntrl |= F_TXPKT_IPCSUM_DIS; + if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)))) + cntrl |= F_TXPKT_L4CSUM_DIS; cpl->cntrl = htonl(cntrl); mlen = m0->m_pkthdr.len; cpl->len = htonl(mlen | 0x80000000); @@ -3223,11 +3257,11 @@ t3_lro_enable(SYSCTL_HANDLER_ARGS) } static int -t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS) +t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; struct qset_params *qsp = &sc->params.sge.qset[0]; - int coalesce_nsecs; + int coalesce_usecs; struct sge_qset *qs; int i, j, err, nqsets = 0; struct mtx *lock; @@ -3235,25 +3269,25 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS) if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); - coalesce_nsecs = qsp->coalesce_nsecs; - err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req); + coalesce_usecs = qsp->coalesce_usecs; + err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req); if (err != 0) { return (err); } - if (coalesce_nsecs == qsp->coalesce_nsecs) + if (coalesce_usecs == qsp->coalesce_usecs) return (0); for (i = 0; i < sc->params.nports; i++) for (j = 0; j < sc->port[i].nqsets; j++) nqsets++; - coalesce_nsecs = max(100, coalesce_nsecs); + coalesce_usecs = max(1, coalesce_usecs); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[i]; qsp = &sc->params.sge.qset[i]; - qsp->coalesce_nsecs = coalesce_nsecs; + qsp->coalesce_usecs = coalesce_usecs; lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; @@ -3356,8 +3390,8 @@ t3_add_configured_sysctls(adapter_t *sc) SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal", CTLTYPE_INT|CTLFLAG_RW, sc, - 0, t3_set_coalesce_nsecs, - "I", "interrupt coalescing timer (ns)"); + 0, t3_set_coalesce_usecs, + "I", "interrupt coalescing timer (us)"); for (i = 0; i < sc->params.nports; i++) { struct port_info *pi = &sc->port[i]; diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c new file mode 100644 index 0000000000000..b198904533465 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c @@ -0,0 +1,294 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/eventhandler.h> + +#include <net/if.h> +#include <net/if_var.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#endif + +/* + * XXX :-/ + * + */ + +#define idr_init(x) + +cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; + +static void open_rnic_dev(struct t3cdev *); +static void close_rnic_dev(struct t3cdev *); + +static TAILQ_HEAD( ,iwch_dev) dev_list; +static struct mtx dev_mutex; +static eventhandler_tag event_tag; + +static void +rnic_init(struct iwch_dev *rnicp) +{ + CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, rnicp); + idr_init(&rnicp->cqidr); + idr_init(&rnicp->qpidr); + idr_init(&rnicp->mmidr); + mtx_init(&rnicp->lock, "iwch rnic lock", NULL, MTX_DEF|MTX_DUPOK); + + rnicp->attr.vendor_id = 0x168; + rnicp->attr.vendor_part_id = 7; + rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; + rnicp->attr.max_wrs = (1UL << 24) - 1; + rnicp->attr.max_sge_per_wr = T3_MAX_SGE; + rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; + rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; + rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1; + rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); + rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; + rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; + rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */ + rnicp->attr.can_resize_wq = 0; + rnicp->attr.max_rdma_reads_per_qp = 8; + rnicp->attr.max_rdma_read_resources = + rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; + rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ + rnicp->attr.max_rdma_read_depth = + 
rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; + rnicp->attr.rq_overflow_handled = 0; + rnicp->attr.can_modify_ird = 0; + rnicp->attr.can_modify_ord = 0; + rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; + rnicp->attr.stag0_value = 1; + rnicp->attr.zbva_support = 1; + rnicp->attr.local_invalidate_fence = 1; + rnicp->attr.cq_overflow_detection = 1; + return; +} + +static void +open_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *rnicp; + static int vers_printed; + + CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev); + if (!vers_printed++) + printf("Chelsio T3 RDMA Driver - version %s\n", + DRV_VERSION); + rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + if (!rnicp) { + printf("Cannot allocate ib device\n"); + return; + } + rnicp->rdev.ulp = rnicp; + rnicp->rdev.t3cdev_p = tdev; + + mtx_lock(&dev_mutex); + + if (cxio_rdev_open(&rnicp->rdev)) { + mtx_unlock(&dev_mutex); + printf("Unable to open CXIO rdev\n"); + ib_dealloc_device(&rnicp->ibdev); + return; + } + + rnic_init(rnicp); + + TAILQ_INSERT_TAIL(&dev_list, rnicp, entry); + mtx_unlock(&dev_mutex); + + if (iwch_register_device(rnicp)) { + printf("Unable to register device\n"); + close_rnic_dev(tdev); + } +#ifdef notyet + printf("Initialized device %s\n", + pci_name(rnicp->rdev.rnic_info.pdev)); +#endif + return; +} + +static void +close_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *dev, *tmp; + CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev); + mtx_lock(&dev_mutex); + + TAILQ_FOREACH_SAFE(dev, &dev_list, entry, tmp) { + if (dev->rdev.t3cdev_p == tdev) { +#ifdef notyet + list_del(&dev->entry); + iwch_unregister_device(dev); + cxio_rdev_close(&dev->rdev); + idr_destroy(&dev->cqidr); + idr_destroy(&dev->qpidr); + idr_destroy(&dev->mmidr); + ib_dealloc_device(&dev->ibdev); +#endif + break; + } + } + mtx_unlock(&dev_mutex); +} + +static ifaddr_event_handler_t +ifaddr_event_handler(void *arg, struct ifnet *ifp) +{ + printf("%s if name %s \n", __FUNCTION__, ifp->if_xname); + if (ifp->if_capabilities & IFCAP_TOE4) { + KASSERT(T3CDEV(ifp) != NULL, ("null t3cdev ptr!")); + if (cxio_hal_find_rdev_by_t3cdev(T3CDEV(ifp)) == NULL) + open_rnic_dev(T3CDEV(ifp)); + } + return 0; +} + + +static int +iwch_init_module(void) +{ + int err; + struct ifnet *ifp; + + printf("%s enter\n", __FUNCTION__); + TAILQ_INIT(&dev_list); + mtx_init(&dev_mutex, "iwch dev_list lock", NULL, MTX_DEF); + + err = cxio_hal_init(); + if (err) + return err; + err = iwch_cm_init(); + if (err) + return err; + cxio_register_ev_cb(iwch_ev_dispatch); + + /* Register for ifaddr events to dynamically add TOE devs */ + event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_event_handler, + NULL, EVENTHANDLER_PRI_ANY); + + /* Register existing TOE interfaces by walking the ifnet chain */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &ifnet, if_link) { + (void)ifaddr_event_handler(NULL, ifp); + } + IFNET_RUNLOCK(); + return 0; +} + +static void +iwch_exit_module(void) +{ + EVENTHANDLER_DEREGISTER(ifaddr_event, event_tag); + cxio_unregister_ev_cb(iwch_ev_dispatch); + iwch_cm_term(); + cxio_hal_exit(); +} + +static int +iwch_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + printf("Loading iw_cxgb.\n"); + + iwch_init_module(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("Unloading iw_cxgb.\n"); + iwch_exit_module(); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data = { + "iw_cxgb", + iwch_load, + 
0 +}; + +MODULE_VERSION(iw_cxgb, 1); +DECLARE_MODULE(iw_cxgb, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); +MODULE_DEPEND(iw_cxgb, rdma_core, 1, 1, 1); +MODULE_DEPEND(iw_cxgb, if_cxgb, 1, 1, 1); +MODULE_DEPEND(iw_cxgb, t3_tom, 1, 1, 1); + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h new file mode 100644 index 0000000000000..f4b28566ebf5b --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h @@ -0,0 +1,168 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ + +#ifndef __IWCH_H__ +#define __IWCH_H__ + +struct iwch_pd; +struct iwch_cq; +struct iwch_qp; +struct iwch_mr; + + +struct iwch_rnic_attributes { + u32 vendor_id; + u32 vendor_part_id; + u32 max_qps; + u32 max_wrs; /* Max for any SQ/RQ */ + u32 max_sge_per_wr; + u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ + u32 max_cqs; + u32 max_cqes_per_cq; + u32 max_mem_regs; + u32 max_phys_buf_entries; /* for phys buf list */ + u32 max_pds; + + /* + * The memory page sizes supported by this RNIC. + * Bit position i in bitmap indicates page of + * size (4k)^i. Phys block list mode unsupported. + */ + u32 mem_pgsizes_bitmask; + u8 can_resize_wq; + + /* + * The maximum number of RDMA Reads that can be outstanding + * per QP with this RNIC as the target. + */ + u32 max_rdma_reads_per_qp; + + /* + * The maximum number of resources used for RDMA Reads + * by this RNIC with this RNIC as the target. + */ + u32 max_rdma_read_resources; + + /* + * The max depth per QP for initiation of RDMA Read + * by this RNIC. 
+ */ + u32 max_rdma_read_qp_depth; + + /* + * The maximum depth for initiation of RDMA Read + * operations by this RNIC on all QPs + */ + u32 max_rdma_read_depth; + u8 rq_overflow_handled; + u32 can_modify_ird; + u32 can_modify_ord; + u32 max_mem_windows; + u32 stag0_value; + u8 zbva_support; + u8 local_invalidate_fence; + u32 cq_overflow_detection; +}; + +struct iwch_dev { + struct ib_device ibdev; + struct cxio_rdev rdev; + u32 device_cap_flags; + struct iwch_rnic_attributes attr; + struct kvl cqidr; + struct kvl qpidr; + struct kvl mmidr; + struct mtx lock; + TAILQ_ENTRY(iwch_dev) entry; +}; + +#ifndef container_of +#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) +#endif + +static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct iwch_dev, ibdev); +} + +static inline int t3b_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3B; +} + +static inline int t3a_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3A; +} + +static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) +{ + return kvl_lookup(&rhp->cqidr, cqid); +} + +static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) +{ + return kvl_lookup(&rhp->qpidr, qpid); +} + +static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) +{ + return kvl_lookup(&rhp->mmidr, mmid); +} + +static inline int insert_handle(struct iwch_dev *rhp, struct kvl *kvlp, + void *handle, u32 id) +{ + int ret; + u32 newid; + + do { + mtx_lock(&rhp->lock); + ret = kvl_alloc_above(kvlp, handle, id, &newid); + WARN_ON(ret != 0); + WARN_ON(!ret && newid != id); + mtx_unlock(&rhp->lock); + } while (ret == -EAGAIN); + + return ret; +} + +static inline void remove_handle(struct iwch_dev *rhp, struct kvl *kvlp, u32 id) +{ + mtx_lock(&rhp->lock); + kvl_delete(kvlp, id); + mtx_unlock(&rhp->lock); +} + +extern struct cxgb_client t3c_client; +extern cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; +extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m); +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c new file mode 100644 index 0000000000000..cec461147ccc7 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c @@ -0,0 +1,1779 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/uio.h> + +#include <net/route.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp.h> +#include <netinet/tcpip.h> + +#include <contrib/rdma/ib_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/tom/cxgb_tom.h> +#include <ulp/tom/cxgb_t3_ddp.h> +#include <ulp/tom/cxgb_defs.h> +#include <ulp/tom/cxgb_toepcb.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#endif + +#ifdef KTR +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; +#endif + +SYSCTL_NODE(_hw, OID_AUTO, cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters"); + +static int ep_timeout_secs = 10; +TUNABLE_INT("hw.iw_cxgb.ep_timeout_secs", &ep_timeout_secs); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RDTUN, &ep_timeout_secs, 0, + "CM Endpoint operation timeout in seconds (default=10)"); + +static int mpa_rev = 1; +TUNABLE_INT("hw.iw_cxgb.mpa_rev", &mpa_rev); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RDTUN, &mpa_rev, 0, + "MPA Revision, 0 supports amso1100, 1 is spec compliant. 
(default=1)"); + +static int markers_enabled = 0; +TUNABLE_INT("hw.iw_cxgb.markers_enabled", &markers_enabled); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RDTUN, &markers_enabled, 0, + "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +TUNABLE_INT("hw.iw_cxgb.crc_enabled", &crc_enabled); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RDTUN, &crc_enabled, 0, + "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +TUNABLE_INT("hw.iw_cxgb.rcv_win", &rcv_win); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RDTUN, &rcv_win, 0, + "TCP receive window in bytes (default=256KB)"); + +static int snd_win = 32 * 1024; +TUNABLE_INT("hw.iw_cxgb.snd_win", &snd_win); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, snd_win, CTLFLAG_RDTUN, &snd_win, 0, + "TCP send window in bytes (default=32KB)"); + +static unsigned int nocong = 0; +TUNABLE_INT("hw.iw_cxgb.nocong", &nocong); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, nocong, CTLFLAG_RDTUN, &nocong, 0, + "Turn off congestion control (default=0)"); + +static unsigned int cong_flavor = 1; +TUNABLE_INT("hw.iw_cxgb.cong_flavor", &cong_flavor); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RDTUN, &cong_flavor, 0, + "TCP Congestion control flavor (default=1)"); + +static void ep_timeout(void *arg); +static void connect_reply_upcall(struct iwch_ep *ep, int status); +static void iwch_so_upcall(struct socket *so, void *arg, int waitflag); + +/* + * Cruft to offload socket upcalls onto thread. + */ +static struct mtx req_lock; +static TAILQ_HEAD(iwch_ep_list, iwch_ep_common) req_list; +static struct task iw_cxgb_task; +static struct taskqueue *iw_cxgb_taskq; +static void process_req(void *ctx, int pending); + +static void +start_ep_timer(struct iwch_ep *ep) +{ + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + if (callout_pending(&ep->timer)) { + CTR2(KTR_IW_CXGB, "%s stopped / restarted timer ep %p", __FUNCTION__, ep); + callout_deactivate(&ep->timer); + callout_drain(&ep->timer); + } else { + /* + * XXX this looks racy + */ + get_ep(&ep->com); + callout_init(&ep->timer, TRUE); + } + callout_reset(&ep->timer, ep_timeout_secs * hz, ep_timeout, ep); +} + +static void +stop_ep_timer(struct iwch_ep *ep) +{ + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + callout_drain(&ep->timer); + put_ep(&ep->com); +} + +static int set_tcpinfo(struct iwch_ep *ep) +{ + struct tcp_info ti; + struct sockopt sopt; + int err; + + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = IPPROTO_TCP; + sopt.sopt_name = TCP_INFO; + sopt.sopt_val = (caddr_t)&ti; + sopt.sopt_valsize = sizeof ti; + sopt.sopt_td = NULL; + + err = sogetopt(ep->com.so, &sopt); + if (err) { + printf("%s can't get tcpinfo\n", __FUNCTION__); + return -err; + } + if (!(ti.tcpi_options & TCPI_OPT_TOE)) { + printf("%s connection NOT OFFLOADED!\n", __FUNCTION__); + return -EINVAL; + } + + ep->snd_seq = ti.tcpi_snd_nxt; + ep->rcv_seq = ti.tcpi_rcv_nxt; + ep->emss = ti.__tcpi_snd_mss - sizeof(struct tcpiphdr); + ep->hwtid = TOEPCB(ep->com.so)->tp_tid; /* XXX */ + if (ti.tcpi_options & TCPI_OPT_TIMESTAMPS) + ep->emss -= 12; + if (ep->emss < 128) + ep->emss = 128; + return 0; +} + +static enum iwch_ep_state +state_read(struct iwch_ep_common *epc) +{ + enum iwch_ep_state state; + + mtx_lock(&epc->lock); + state = epc->state; + mtx_unlock(&epc->lock); + return state; +} + +static void +__state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + epc->state = new; +} + +static void +state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + + mtx_lock(&epc->lock); 
+ CTR3(KTR_IW_CXGB, "%s - %s -> %s", __FUNCTION__, states[epc->state], states[new]); + __state_set(epc, new); + mtx_unlock(&epc->lock); + return; +} + +static void * +alloc_ep(int size, int flags) +{ + struct iwch_ep_common *epc; + + epc = malloc(size, M_DEVBUF, flags); + if (epc) { + memset(epc, 0, size); + refcount_init(&epc->refcount, 1); + mtx_init(&epc->lock, "iwch_epc lock", NULL, MTX_DEF|MTX_DUPOK); + cv_init(&epc->waitq, "iwch_epc cv"); + } + CTR2(KTR_IW_CXGB, "%s alloc ep %p", __FUNCTION__, epc); + return epc; +} + +void __free_ep(struct iwch_ep_common *epc) +{ + CTR3(KTR_IW_CXGB, "%s ep %p state %s", __FUNCTION__, epc, states[state_read(epc)]); + KASSERT(!epc->so, ("%s warning ep->so %p \n", __FUNCTION__, epc->so)); + KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list!\n", __FUNCTION__, epc)); + free(epc, M_DEVBUF); +} + +int +iwch_quiesce_tid(struct iwch_ep *ep) +{ +#ifdef notyet + struct cpl_set_tcb_field *req; + struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT); + + if (m == NULL) + return (-ENOMEM); + req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); + + m_set_priority(m, CPL_PRIORITY_DATA); + cxgb_ofld_send(ep->com.tdev, m); +#endif + return 0; +} + +int +iwch_resume_tid(struct iwch_ep *ep) +{ +#ifdef notyet + struct cpl_set_tcb_field *req; + struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT); + + if (m == NULL) + return (-ENOMEM); + req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = 0; + + m_set_priority(m, CPL_PRIORITY_DATA); + cxgb_ofld_send(ep->com.tdev, m); +#endif + return 0; +} + +static struct rtentry * +find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct route iproute; + struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst; + + bzero(&iproute, sizeof iproute); + dst->sin_family = AF_INET; + dst->sin_len = sizeof *dst; + dst->sin_addr.s_addr = peer_ip; + + rtalloc(&iproute); + return iproute.ro_rt; +} + +static void +close_socket(struct iwch_ep_common *epc) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); + SOCK_LOCK(epc->so); + epc->so->so_upcall = NULL; + epc->so->so_upcallarg = NULL; + epc->so->so_rcv.sb_flags &= ~SB_UPCALL; + SOCK_UNLOCK(epc->so); + soshutdown(epc->so, SHUT_WR|SHUT_RD); + epc->so = NULL; +} + +static void +shutdown_socket(struct iwch_ep_common *epc) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); + soshutdown(epc->so, SHUT_WR); +} + +static void +abort_socket(struct iwch_ep *ep) +{ + struct sockopt sopt; + int err; + struct linger l; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + l.l_onoff = 1; + l.l_linger = 0; + + /* linger_time of 0 forces RST to be sent */ + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = SOL_SOCKET; + 
sopt.sopt_name = SO_LINGER; + sopt.sopt_val = (caddr_t)&l; + sopt.sopt_valsize = sizeof l; + sopt.sopt_td = NULL; + err = sosetopt(ep->com.so, &sopt); + if (err) + printf("%s can't set linger to 0, no RST! err %d\n", __FUNCTION__, err); +} + +static void +send_mpa_req(struct iwch_ep *ep) +{ + int mpalen; + struct mpa_message *mpa; + struct mbuf *m; + int err; + + CTR3(KTR_IW_CXGB, "%s ep %p pd_len %d", __FUNCTION__, ep, ep->plen); + + mpalen = sizeof(*mpa) + ep->plen; + m = m_gethdr(mpalen, M_NOWAIT); + if (m == NULL) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + mpa = mtod(m, struct mpa_message *); + m->m_len = mpalen; + m->m_pkthdr.len = mpalen; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev; + if (ep->plen) + memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + + err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); + if (err) { + m_freem(m); + connect_reply_upcall(ep, -ENOMEM); + return; + } + + start_ep_timer(ep); + state_set(&ep->com, MPA_REQ_SENT); + return; +} + +static int +send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct mpa_message *mpa; + struct mbuf *m; + int err; + + CTR3(KTR_IW_CXGB, "%s ep %p plen %d", __FUNCTION__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + m = m_gethdr(mpalen, M_NOWAIT); + if (m == NULL) { + printf("%s - cannot alloc mbuf!\n", __FUNCTION__); + return (-ENOMEM); + } + mpa = mtod(m, struct mpa_message *); + m->m_len = mpalen; + m->m_pkthdr.len = mpalen; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); + PANIC_IF(err); + return 0; +} + +static int +send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct mpa_message *mpa; + struct mbuf *m; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p plen %d", __FUNCTION__, ep, ep->com.so, plen); + + mpalen = sizeof(*mpa) + plen; + + m = m_gethdr(mpalen, M_NOWAIT); + if (m == NULL) { + printf("%s - cannot alloc mbuf!\n", __FUNCTION__); + return (-ENOMEM); + } + mpa = mtod(m, struct mpa_message *); + m->m_len = mpalen; + m->m_pkthdr.len = mpalen; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? 
MPA_MARKERS : 0); + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + state_set(&ep->com, MPA_REP_SENT); + return sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, + ep->com.thread); +} + +static void +close_complete_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "close complete delivered ep %p cm_id %p tid %d", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void +abort_connection(struct iwch_ep *ep) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + state_set(&ep->com, ABORTING); + abort_socket(ep); + close_socket(&ep->com); + close_complete_upcall(ep); + state_set(&ep->com, DEAD); + put_ep(&ep->com); +} + +static void +peer_close_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "peer close delivered ep %p cm_id %p tid %d", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void +peer_abort_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = ECONNRESET; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "abort delivered ep %p cm_id %p tid %d", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void +connect_reply_upcall(struct iwch_ep *ep, int status) +{ + struct iw_cm_event event; + + CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + + if ((status == 0) || (status == ECONNREFUSED)) { + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + if (ep->com.cm_id) { + CTR4(KTR_IW_CXGB, "%s ep %p tid %d status %d", __FUNCTION__, ep, + ep->hwtid, status); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void +connect_request_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + event.provider_data = ep; + event.so = ep->com.so; + if (state_read(&ep->parent_ep->com) != DEAD) + 
ep->parent_ep->com.cm_id->event_handler( + ep->parent_ep->com.cm_id, + &event); + put_ep(&ep->parent_ep->com); + ep->parent_ep = NULL; +} + +static void +established_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "%s ep %p tid %d", __FUNCTION__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void +process_mpa_reply(struct iwch_ep *ep) +{ + struct mpa_message *mpa; + u16 plen; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + int err; + struct mbuf *top, *m; + int flags = MSG_DONTWAIT; + struct uio uio; + int len; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_SENT) + return; + + uio.uio_resid = len = 1000000; + uio.uio_td = ep->com.thread; + err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); + if (err) { + if (err == EWOULDBLOCK) { + start_ep_timer(ep); + return; + } + err = -err; + goto err; + } + + if (ep->com.so->so_rcv.sb_mb) { + printf("%s data after soreceive called! so %p sb_mb %p top %p\n", + __FUNCTION__, ep->com.so, ep->com.so->so_rcv.sb_mb, top); + } + + m = top; + do { + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { + err = (-EINVAL); + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); + ep->mpa_pkt_len += m->m_len; + if (!m->m_next) + m = m->m_nextpkt; + else + m = m->m_next; + } while (m); + + m_freem(top); + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + mpa = (struct mpa_message *)ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision != mpa_rev) { + CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); + err = EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); + err = EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); + err = EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + CTR2(KTR_IW_CXGB, "%s pkt too big %d", __FUNCTION__, ep->mpa_pkt_len); + err = EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + if (mpa->flags & MPA_REJECT) { + err = ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__); + state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 
1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + if (set_tcpinfo(ep)) { + printf("%s set_tcpinfo error\n", __FUNCTION__); + goto err; + } + CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (!err) + goto out; +err: + abort_connection(ep); +out: + connect_reply_upcall(ep, err); + return; +} + +static void +process_mpa_request(struct iwch_ep *ep) +{ + struct mpa_message *mpa; + u16 plen; + int flags = MSG_DONTWAIT; + struct mbuf *top, *m; + int err; + struct uio uio; + int len; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_WAIT) + return; + + uio.uio_resid = len = 1000000; + uio.uio_td = ep->com.thread; + err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); + if (err) { + if (err == EWOULDBLOCK) { + start_ep_timer(ep); + return; + } + err = -err; + goto err; + } + + m = top; + do { + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { + CTR2(KTR_IW_CXGB, "%s mpa message too big %d", __FUNCTION__, + ep->mpa_pkt_len + m->m_len); + goto err; + } + + + /* + * Copy the new data into our accumulation buffer. + */ + m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); + ep->mpa_pkt_len += m->m_len; + + if (!m->m_next) + m = m->m_nextpkt; + else + m = m->m_next; + } while (m); + + m_freem(top); + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) { + start_ep_timer(ep); + CTR2(KTR_IW_CXGB, "%s not enough header %d...waiting...", __FUNCTION__, + ep->mpa_pkt_len); + return; + } + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision != mpa_rev) { + CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); + goto err; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + CTR2(KTR_IW_CXGB, "%s more data after private data %d", __FUNCTION__, + ep->mpa_pkt_len); + goto err; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. 
+ */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) { + start_ep_timer(ep); + CTR2(KTR_IW_CXGB, "%s more mpa msg to come %d", __FUNCTION__, + ep->mpa_pkt_len); + return; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + if (set_tcpinfo(ep)) { + printf("%s set_tcpinfo error\n", __FUNCTION__); + goto err; + } + CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + connect_request_upcall(ep); + return; +err: + abort_connection(ep); + return; +} + +static void +process_peer_close(struct iwch_ep *ep) +{ + struct iwch_qp_attributes attrs; + int disconnect = 1; + int release = 0; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + mtx_lock(&ep->com.lock); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. + */ + __state_set(&ep->com, CLOSING); + get_ep(&ep->com); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = IWCH_QP_STATE_CLOSING; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + peer_close_upcall(ep); + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_socket(&ep->com); + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + PANIC_IF(1); + } + mtx_unlock(&ep->com.lock); + if (disconnect) + iwch_ep_disconnect(ep, 0, M_NOWAIT); + if (release) + put_ep(&ep->com); + return; +} + +static void +process_conn_error(struct iwch_ep *ep) +{ + struct iwch_qp_attributes attrs; + int ret; + int state; + + state = state_read(&ep->com); + CTR5(KTR_IW_CXGB, "%s ep %p so %p so->so_error %u state %s", __FUNCTION__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]); + switch (state) { + case MPA_REQ_WAIT: + stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + stop_ep_timer(ep); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REP_SENT: + ep->com.rpl_err = ECONNRESET; + CTR1(KTR_IW_CXGB, "waking up ep %p", ep); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. 
+ */ + get_ep(&ep->com); + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + ret = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + log(LOG_ERR, + "%s - qp <- error failed!\n", + __FUNCTION__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__, + ep->com.so->so_error); + return; + default: + PANIC_IF(1); + break; + } + + if (state != ABORTING) { + close_socket(&ep->com); + state_set(&ep->com, DEAD); + put_ep(&ep->com); + } + return; +} + +static void +process_close_complete(struct iwch_ep *ep) +{ + struct iwch_qp_attributes attrs; + int release = 0; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + PANIC_IF(!ep); + + /* The cm_id may be null if we failed to connect */ + mtx_lock(&ep->com.lock); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_socket(&ep->com); + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + break; + case DEAD: + default: + PANIC_IF(1); + break; + } + mtx_unlock(&ep->com.lock); + if (release) + put_ep(&ep->com); + return; +} + +/* + * T3A does 3 things when a TERM is received: + * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet + * 2) generate an async event on the QP with the TERMINATE opcode + * 3) post a TERMINATE opcde cqe into the associated CQ. + * + * For (1), we save the message in the qp for later consumer consumption. + * For (2), we move the QP into TERMINATE, post a QP event and disconnect. + * For (3), we toss the CQE in cxio_poll_cq(). + * + * terminate() handles case (1)... + */ +static int +terminate(struct t3cdev *tdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + struct socket *so = toeptoso(toep); + struct iwch_ep *ep = so->so_upcallarg; + + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + m_adj(m, sizeof(struct cpl_rdma_terminate)); + CTR2(KTR_IW_CXGB, "%s saving %d bytes of term msg", __FUNCTION__, m->m_len); + m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer); + ep->com.qp->attr.terminate_msg_len = m->m_len; + ep->com.qp->attr.is_terminate_local = 0; + return CPL_RET_BUF_DONE; +} + +static int +ec_status(struct t3cdev *tdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + struct socket *so = toeptoso(toep); + struct cpl_rdma_ec_status *rep = cplhdr(m); + struct iwch_ep *ep; + struct iwch_qp_attributes attrs; + int release = 0; + + ep = so->so_upcallarg; + CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s ec_status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], rep->status); + if (!so || !ep) { + panic("bogosity ep %p state %d, so %p state %x\n", ep, ep ? ep->com.state : -1, so, so ? 
so->so_state : -1); + } + mtx_lock(&ep->com.lock); + switch (ep->com.state) { + case CLOSING: + if (!rep->status) + __state_set(&ep->com, MORIBUND); + else + __state_set(&ep->com, ABORTING); + break; + case MORIBUND: + stop_ep_timer(ep); + if (!rep->status) { + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_socket(&ep->com); + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + } + break; + case DEAD: + break; + default: + panic("unknown state: %d\n", ep->com.state); + } + mtx_unlock(&ep->com.lock); + if (rep->status) { + log(LOG_ERR, "%s BAD CLOSE - Aborting tid %u\n", + __FUNCTION__, ep->hwtid); + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + if (release) + put_ep(&ep->com); + return CPL_RET_BUF_DONE; +} + +static void +ep_timeout(void *arg) +{ + struct iwch_ep *ep = (struct iwch_ep *)arg; + struct iwch_qp_attributes attrs; + int err = 0; + + mtx_lock(&ep->com.lock); + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + switch (ep->com.state) { + case MPA_REQ_SENT: + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) + err = 1; + break; + default: + panic("unknown state: %d\n", ep->com.state); + } + __state_set(&ep->com, ABORTING); + mtx_unlock(&ep->com.lock); + if (err){ + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + abort_connection(ep); + put_ep(&ep->com); +} + +int +iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + struct iwch_ep *ep = to_ep(cm_id); + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return (-ECONNRESET); + } + PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); + if (mpa_rev == 0) { + abort_connection(ep); + } else { + err = send_mpa_reject(ep, pdata, pdata_len); + err = soshutdown(ep->com.so, 3); + } + return 0; +} + +int +iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + struct iwch_ep *ep = to_ep(cm_id); + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_qp *qp = get_qhp(h, conn_param->qpn); + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + if (state_read(&ep->com) == DEAD) + return (-ECONNRESET); + + PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); + PANIC_IF(!qp); + + if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || + (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { + abort_connection(ep); + return (-EINVAL); + } + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + + ep->com.rpl_err = 0; + ep->com.rpl_done = 0; + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord); + get_ep(&ep->com); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ord; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + 
IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + + if (err) + goto err; + + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err; + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + put_ep(&ep->com); + return 0; +err: + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return err; +} + +static int init_sock(struct iwch_ep_common *epc) +{ + int err; + struct sockopt sopt; + int on=1; + + epc->so->so_upcall = iwch_so_upcall; + epc->so->so_upcallarg = epc; + epc->so->so_rcv.sb_flags |= SB_UPCALL; + epc->so->so_state |= SS_NBIO; + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = SOL_SOCKET; + sopt.sopt_name = SO_NO_DDP; + sopt.sopt_val = (caddr_t)&on; + sopt.sopt_valsize = sizeof on; + sopt.sopt_td = NULL; + err = sosetopt(epc->so, &sopt); + if (err) + printf("%s can't set SO_NO_DDP err %d\n", __FUNCTION__, err); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = IPPROTO_TCP; + sopt.sopt_name = TCP_NODELAY; + sopt.sopt_val = (caddr_t)&on; + sopt.sopt_valsize = sizeof on; + sopt.sopt_td = NULL; + err = sosetopt(epc->so, &sopt); + if (err) + printf("%s can't set TCP_NODELAY err %d\n", __FUNCTION__, err); + + return 0; +} + +static int +is_loopback_dst(struct iw_cm_id *cm_id) +{ + uint16_t port = cm_id->remote_addr.sin_port; + struct ifaddr *ifa; + + cm_id->remote_addr.sin_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *)&cm_id->remote_addr); + cm_id->remote_addr.sin_port = port; + return (ifa != NULL); +} + +int +iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_ep *ep; + struct rtentry *rt; + struct toedev *tdev; + + if (is_loopback_dst(cm_id)) { + err = -ENOSYS; + goto out; + } + + ep = alloc_ep(sizeof(*ep), M_NOWAIT); + if (!ep) { + printf("%s - cannot alloc ep.\n", __FUNCTION__); + err = (-ENOMEM); + goto out; + } + callout_init(&ep->timer, TRUE); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(h, conn_param->qpn); + ep->com.thread = curthread; + PANIC_IF(!ep->com.qp); + CTR4(KTR_IW_CXGB, "%s qpn 0x%x qp %p cm_id %p", __FUNCTION__, conn_param->qpn, + ep->com.qp, cm_id); + + ep->com.so = cm_id->so; + err = init_sock(&ep->com); + if (err) + goto fail2; + + /* find a route */ + rt = find_route(cm_id->local_addr.sin_addr.s_addr, + cm_id->remote_addr.sin_addr.s_addr, + cm_id->local_addr.sin_port, + cm_id->remote_addr.sin_port, IPTOS_LOWDELAY); + if (!rt) { + printf("%s - cannot find route.\n", __FUNCTION__); + err = EHOSTUNREACH; + goto fail2; + } + + if (!(rt->rt_ifp->if_flags & IFCAP_TOE)) { + printf("%s - interface not TOE capable.\n", __FUNCTION__); + goto fail3; + } + tdev = TOEDEV(rt->rt_ifp); + if (tdev == NULL) { + printf("%s - No toedev for interface.\n", __FUNCTION__); + goto fail3; + } + if (!tdev->tod_can_offload(tdev, ep->com.so)) { + printf("%s - interface cannot offload!.\n", __FUNCTION__); + goto fail3; + } + RTFREE(rt); + + state_set(&ep->com, CONNECTING); + ep->com.local_addr = cm_id->local_addr; + ep->com.remote_addr = cm_id->remote_addr; + err = soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr, + 
ep->com.thread); + if (!err) + goto out; +fail3: + RTFREE(ep->dst); +fail2: + put_ep(&ep->com); +out: + return err; +} + +int +iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct iwch_listen_ep *ep; + + ep = alloc_ep(sizeof(*ep), M_NOWAIT); + if (!ep) { + printf("%s - cannot alloc ep.\n", __FUNCTION__); + err = ENOMEM; + goto out; + } + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->backlog = backlog; + ep->com.local_addr = cm_id->local_addr; + ep->com.thread = curthread; + state_set(&ep->com, LISTEN); + + ep->com.so = cm_id->so; + err = init_sock(&ep->com); + if (err) + goto fail; + + err = solisten(ep->com.so, ep->backlog, ep->com.thread); + if (!err) { + cm_id->provider_data = ep; + goto out; + } + close_socket(&ep->com); +fail: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +out: + return err; +} + +int +iwch_destroy_listen(struct iw_cm_id *cm_id) +{ + struct iwch_listen_ep *ep = to_listen_ep(cm_id); + + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + + state_set(&ep->com, DEAD); + close_socket(&ep->com); + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return 0; +} + +int +iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags) +{ + int close = 0; + + mtx_lock(&ep->com.lock); + + PANIC_IF(!ep); + PANIC_IF(!ep->com.so); + + CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__, ep, + ep->com.so, states[ep->com.state], abrupt); + + if (ep->com.state == DEAD) { + CTR2(KTR_IW_CXGB, "%s already dead ep %p", __FUNCTION__, ep); + goto out; + } + + if (abrupt) { + if (ep->com.state != ABORTING) { + ep->com.state = ABORTING; + close = 1; + } + goto out; + } + + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + start_ep_timer(ep); + ep->com.state = CLOSING; + close = 1; + break; + case CLOSING: + ep->com.state = MORIBUND; + close = 1; + break; + case MORIBUND: + case ABORTING: + break; + default: + panic("unknown state: %d\n", ep->com.state); + break; + } +out: + mtx_unlock(&ep->com.lock); + if (close) { + if (abrupt) + abort_connection(ep); + else + shutdown_socket(&ep->com); + } + return 0; +} + +static void +process_data(struct iwch_ep *ep) +{ + struct sockaddr_in *local, *remote; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + switch (state_read(&ep->com)) { + case MPA_REQ_SENT: + process_mpa_reply(ep); + break; + case MPA_REQ_WAIT: + + /* + * XXX + * Set local and remote addrs here because when we + * dequeue the newly accepted socket, they aren't set + * yet in the pcb! + */ + in_getsockaddr(ep->com.so, (struct sockaddr **)&local); + in_getpeeraddr(ep->com.so, (struct sockaddr **)&remote); + CTR3(KTR_IW_CXGB, "%s local %s remote %s", __FUNCTION__, + inet_ntoa(local->sin_addr), + inet_ntoa(remote->sin_addr)); + ep->com.local_addr = *local; + ep->com.remote_addr = *remote; + free(local, M_SONAME); + free(remote, M_SONAME); + process_mpa_request(ep); + break; + default: + if (ep->com.so->so_rcv.sb_cc) + printf("%s Unexpected streaming data." 
+ " ep %p state %d so %p so_state %x so_rcv.sb_cc %u so_rcv.sb_mb %p\n", + __FUNCTION__, ep, state_read(&ep->com), ep->com.so, ep->com.so->so_state, + ep->com.so->so_rcv.sb_cc, ep->com.so->so_rcv.sb_mb); + break; + } + return; +} + +static void +process_connected(struct iwch_ep *ep) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + if ((ep->com.so->so_state & SS_ISCONNECTED) && !ep->com.so->so_error) { + send_mpa_req(ep); + } else { + connect_reply_upcall(ep, -ep->com.so->so_error); + close_socket(&ep->com); + state_set(&ep->com, DEAD); + put_ep(&ep->com); + } +} + +static struct socket * +dequeue_socket(struct socket *head, struct sockaddr_in **remote, struct iwch_ep *child_ep) +{ + struct socket *so; + + ACCEPT_LOCK(); + so = TAILQ_FIRST(&head->so_comp); + if (!so) { + ACCEPT_UNLOCK(); + return NULL; + } + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + SOCK_LOCK(so); + so->so_qstate &= ~SQ_COMP; + so->so_head = NULL; + soref(so); + so->so_rcv.sb_flags |= SB_UPCALL; + so->so_state |= SS_NBIO; + so->so_upcall = iwch_so_upcall; + so->so_upcallarg = child_ep; + PANIC_IF(!(so->so_state & SS_ISCONNECTED)); + PANIC_IF(so->so_error); + SOCK_UNLOCK(so); + ACCEPT_UNLOCK(); + soaccept(so, (struct sockaddr **)remote); + return so; +} + +static void +process_newconn(struct iwch_ep *parent_ep) +{ + struct socket *child_so; + struct iwch_ep *child_ep; + struct sockaddr_in *remote; + + CTR3(KTR_IW_CXGB, "%s parent ep %p so %p", __FUNCTION__, parent_ep, parent_ep->com.so); + child_ep = alloc_ep(sizeof(*child_ep), M_NOWAIT); + if (!child_ep) { + log(LOG_ERR, "%s - failed to allocate ep entry!\n", + __FUNCTION__); + return; + } + child_so = dequeue_socket(parent_ep->com.so, &remote, child_ep); + if (!child_so) { + log(LOG_ERR, "%s - failed to dequeue child socket!\n", + __FUNCTION__); + __free_ep(&child_ep->com); + return; + } + CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__, + inet_ntoa(remote->sin_addr), ntohs(remote->sin_port)); + child_ep->com.so = child_so; + child_ep->com.cm_id = NULL; + child_ep->com.thread = parent_ep->com.thread; + child_ep->parent_ep = parent_ep; + free(remote, M_SONAME); + get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + callout_init(&child_ep->timer, TRUE); + state_set(&child_ep->com, MPA_REQ_WAIT); + start_ep_timer(child_ep); + + /* maybe the request has already been queued up on the socket... 
*/ + process_mpa_request(child_ep); +} + +static void +iwch_so_upcall(struct socket *so, void *arg, int waitflag) +{ + struct iwch_ep *ep = arg; + + CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); + mtx_lock(&req_lock); + if (ep && ep->com.so && !ep->com.entry.tqe_prev) { + get_ep(&ep->com); + TAILQ_INSERT_TAIL(&req_list, &ep->com, entry); + taskqueue_enqueue(iw_cxgb_taskq, &iw_cxgb_task); + } + mtx_unlock(&req_lock); +} + +static void +process_socket_event(struct iwch_ep *ep) +{ + int state = state_read(&ep->com); + struct socket *so = ep->com.so; + + CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); + if (state == CONNECTING) { + process_connected(ep); + return; + } + + if (state == LISTEN) { + process_newconn(ep); + return; + } + + /* connection error */ + if (so->so_error) { + process_conn_error(ep); + return; + } + + /* peer close */ + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state < CLOSING) { + process_peer_close(ep); + return; + } + + /* close complete */ + if (so->so_state & (SS_ISDISCONNECTED)) { + process_close_complete(ep); + return; + } + + /* rx data */ + process_data(ep); + return; +} + +static void +process_req(void *ctx, int pending) +{ + struct iwch_ep_common *epc; + + CTR1(KTR_IW_CXGB, "%s enter", __FUNCTION__); + mtx_lock(&req_lock); + while (!TAILQ_EMPTY(&req_list)) { + epc = TAILQ_FIRST(&req_list); + TAILQ_REMOVE(&req_list, epc, entry); + epc->entry.tqe_prev = NULL; + mtx_unlock(&req_lock); + if (epc->so) + process_socket_event((struct iwch_ep *)epc); + put_ep(epc); + mtx_lock(&req_lock); + } + mtx_unlock(&req_lock); +} + +int +iwch_cm_init(void) +{ + TAILQ_INIT(&req_list); + mtx_init(&req_lock, "iw_cxgb req_list lock", NULL, MTX_DEF); + iw_cxgb_taskq = taskqueue_create("iw_cxgb_taskq", M_NOWAIT, + taskqueue_thread_enqueue, &iw_cxgb_taskq); + if (iw_cxgb_taskq == NULL) { + printf("failed to allocate iw_cxgb taskqueue\n"); + return (ENOMEM); + } + taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq"); + TASK_INIT(&iw_cxgb_task, 0, process_req, NULL); + t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, terminate); + t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, ec_status); + return 0; +} + +void +iwch_cm_term(void) +{ + t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, NULL); + t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, NULL); + taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task); + taskqueue_free(iw_cxgb_taskq); +} + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h new file mode 100644 index 0000000000000..4250be33300ad --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h @@ -0,0 +1,249 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ + +#ifndef _IWCH_CM_H_ +#define _IWCH_CM_H_ +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/iw_cm.h> +#include <sys/refcount.h> +#include <sys/condvar.h> +#include <sys/proc.h> + + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_REV o0 /* XXX - amso1100 uses rev 0 ! */ +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define put_ep(ep) { \ + CTR4(KTR_IW_CXGB, "put_ep (via %s:%u) ep %p refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_load_acq_int(&((ep)->refcount))); \ + if (refcount_release(&((ep)->refcount))) \ + __free_ep(ep); \ +} + +#define get_ep(ep) { \ + CTR4(KTR_IW_CXGB, "get_ep (via %s:%u) ep %p, refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_load_acq_int(&((ep)->refcount))); \ + refcount_acquire(&((ep)->refcount)); \ +} + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum iwch_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum iwch_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum iwch_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum iwch_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03 +}; + +enum iwch_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +enum iwch_ep_flags { + PEER_ABORT_IN_PROGRESS = (1 << 0), + ABORT_REQ_IN_PROGRESS = (1 << 1), +}; + +struct iwch_ep_common { + TAILQ_ENTRY(iwch_ep_common) entry; + struct iw_cm_id *cm_id; + struct iwch_qp *qp; + struct t3cdev *tdev; + enum iwch_ep_state state; + u_int refcount; + struct cv waitq; + struct mtx lock; + struct 
sockaddr_in local_addr; + struct sockaddr_in remote_addr; + int rpl_err; + int rpl_done; + struct thread *thread; + struct socket *so; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; + unsigned int stid; + int backlog; +}; + +struct iwch_ep { + struct iwch_ep_common com; + struct iwch_ep *parent_ep; + struct callout timer; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct rtentry *dst; + struct mbuf *mpa_mbuf; + struct iwch_mpa_attributes mpa_attr; + unsigned int mpa_pkt_len; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + u8 tos; + u16 emss; + u16 plen; + u32 ird; + u32 ord; + u32 flags; +}; + +static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<<wscale) < win) + wscale++; + return wscale; +} + +static __inline void +iwch_wait(struct cv *cv, struct mtx *lock, int *rpl_done) +{ + mtx_lock(lock); + if (!*rpl_done) { + CTR0(KTR_IW_CXGB, "sleeping for rpl_done\n"); + cv_wait_unlock(cv, lock); + } + CTR1(KTR_IW_CXGB, "*rpl_done=%d\n", *rpl_done); +} + +static __inline void +iwch_wakeup(struct cv *cv, struct mtx *lock, int *rpl_done) +{ + mtx_lock(lock); + *rpl_done=1; + CTR0(KTR_IW_CXGB, "wakeup for rpl_done\n"); + cv_broadcast(cv); + mtx_unlock(lock); +} + +/* CM prototypes */ + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog); +int iwch_destroy_listen(struct iw_cm_id *cm_id); +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags); +int iwch_quiesce_tid(struct iwch_ep *ep); +int iwch_resume_tid(struct iwch_ep *ep); +void __free_ep(struct iwch_ep_common *ep); +void iwch_rearp(struct iwch_ep *ep); +int iwch_ep_redirect(void *ctx, struct rtentry *old, struct rtentry *new, struct l2t_entry *l2t); + +int iwch_cm_init(void); +void iwch_cm_term(void); + +#endif /* _IWCH_CM_H_ */ diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c new file mode 100644 index 0000000000000..93a9e89fd39d2 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c @@ -0,0 +1,276 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +/* + * Get one cq entry from cxio and map it to openib. 
+ * + * Returns: + * 0 cqe returned + * -ENOBUFS EMPTY; + * -EAGAIN caller must try again + * any other neg errno fatal error + */ +static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, + struct ib_wc *wc) +{ + struct iwch_qp *qhp = NULL; + struct t3_cqe cqe, *rd_cqe; + struct t3_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie; + int ret = 1; + + rd_cqe = cxio_next_cqe(&chp->cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (!qhp) + wq = NULL; + else { + mtx_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, + &credit); + if (t3a_device(chp->rhp) && credit) { + CTR3(KTR_IW_CXGB, "%s updating %d cq credits on id %d", __FUNCTION__, + credit, chp->cq.cqid); + cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit); + } + + if (ret) { + ret = -EAGAIN; + goto out; + } + ret = 1; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(cqe); + + CTR4(KTR_IW_CXGB, "iwch_poll_cq_one qpid 0x%x type %d opcode %d status 0x%x", + CQE_QPID(cqe), CQE_TYPE(cqe), + CQE_OPCODE(cqe), CQE_STATUS(cqe)); + CTR3(KTR_IW_CXGB, "wrid hi 0x%x lo 0x%x cookie 0x%llx", + CQE_WRID_HI(cqe), CQE_WRID_LOW(cqe), (unsigned long long) cookie); + + if (CQE_TYPE(cqe) == 0) { + if (!CQE_STATUS(cqe)) + wc->byte_len = CQE_LEN(cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + } else { + switch (CQE_OPCODE(cqe)) { + case T3_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case T3_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(cqe); + break; + case T3_SEND: + case T3_SEND_WITH_SE: + wc->opcode = IB_WC_SEND; + break; + case T3_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + /* these aren't supported yet */ + case T3_SEND_WITH_INV: + case T3_SEND_WITH_SE_INV: + case T3_LOCAL_INV: + case T3_FAST_REGISTER: + default: + log(LOG_ERR, "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(cqe)) { + case TPT_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case TPT_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case TPT_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + case TPT_ERR_OPCODE: + wc->status = IB_WC_FATAL_ERR; + break; + case TPT_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + log(LOG_ERR, "Unexpected cqe_status 0x%x for " + "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + mtx_unlock(&qhp->lock); + return ret; +} + +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + int npolled; + int err = 0; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + + mtx_lock(&chp->lock); + for (npolled = 0; npolled < 
num_entries; ++npolled) { +#ifdef DEBUG + int i=0; +#endif + + /* + * Because T3 can post CQEs that are _not_ associated + * with a WR, we might have to poll again after removing + * one of these. + */ + do { + err = iwch_poll_cq_one(rhp, chp, wc + npolled); +#ifdef DEBUG + PANIC_IF(++i > 1000); +#endif + } while (err == -EAGAIN); + if (err <= 0) + break; + } + mtx_unlock(&chp->lock); + + if (err < 0) { + return err; + } else { + return npolled; + } +} + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c new file mode 100644 index 0000000000000..8b52119e306ec --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c @@ -0,0 +1,255 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + +#ifdef DEBUG +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +void cxio_dump_tpt(struct cxio_rdev *rdev, uint32_t stag) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size = 32; + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; + m->len = size; + CTR3(KTR_IW_CXGB, "%s TPT addr 0x%x len %d", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + CTR2(KTR_IW_CXGB, "TPT %08x: %016llx", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + free(m, M_DEVBUF); +} + +void cxio_dump_pbl(struct cxio_rdev *rdev, uint32_t pbl_addr, uint32_t len, u8 shift) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size, npages; + + shift += 12; + npages = (len + (1ULL << shift) - 1) >> shift; + size = npages * sizeof(u64); + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = pbl_addr; + m->len = size; + CTR4(KTR_IW_CXGB, "%s PBL addr 0x%x len %d depth %d", + __FUNCTION__, m->addr, m->len, npages); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + CTR2(KTR_IW_CXGB, "PBL %08x: %016llx", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + free(m, M_DEVBUF); +} + +void cxio_dump_wqe(union t3_wr *wqe) +{ + uint64_t *data = (uint64_t *)wqe; + uint32_t size = (uint32_t)(be64toh(*data) & 0xff); + + if (size == 
0) + size = 8; + while (size > 0) { + CTR2(KTR_IW_CXGB, "WQE %p: %016llx", data, + (unsigned long long) be64toh(*data)); + size--; + data++; + } +} + +void cxio_dump_wce(struct t3_cqe *wce) +{ + uint64_t *data = (uint64_t *)wce; + int size = sizeof(*wce); + + while (size > 0) { + CTR2(KTR_IW_CXGB, "WCE %p: %016llx", data, + (unsigned long long) be64toh(*data)); + size -= 8; + data++; + } +} + +void cxio_dump_rqt(struct cxio_rdev *rdev, uint32_t hwtid, int nents) +{ + struct ch_mem_range *m; + int size = nents * 64; + u64 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; + m->len = size; + CTR3(KTR_IW_CXGB, "%s RQT addr 0x%x len %d", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + CTR2(KTR_IW_CXGB, "RQT %08x: %016llx", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + free(m, M_DEVBUF); +} + +void cxio_dump_tcb(struct cxio_rdev *rdev, uint32_t hwtid) +{ + struct ch_mem_range *m; + int size = TCB_SIZE; + uint32_t *data; + int rc; + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_CM; + m->addr = hwtid * size; + m->len = size; + CTR3(KTR_IW_CXGB, "%s TCB %d len %d", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (uint32_t *)m->buf; + while (size > 0) { + printf("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", + m->addr, + *(data+2), *(data+3), *(data),*(data+1), + *(data+6), *(data+7), *(data+4), *(data+5)); + size -= 32; + data += 8; + m->addr += 32; + } + free(m, M_DEVBUF); +} +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c new file mode 100644 index 0000000000000..2e8154731133d --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c @@ -0,0 +1,265 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +static void +post_qp_event(struct iwch_dev *rnicp, struct iwch_qp *qhp, struct iwch_cq *chp, + struct respQ_msg_t *rsp_msg, + enum ib_event_type ib_event, + int send_term) +{ + struct ib_event event; + struct iwch_qp_attributes attrs; + + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || + (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { + CTR4(KTR_IW_CXGB, "%s AE received after RTS - " + "qp state %d qpid 0x%x status 0x%x", __FUNCTION__, + qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe)); + return; + } + + log(LOG_ERR, "%s - AE qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } +} + +void +iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m) +{ + struct iwch_dev *rnicp; 
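+	/*
+	 * Look up the CQ and QP named in the response-queue message,
+	 * take a reference on each, and then route the event by CQE
+	 * opcode/status; events for unknown owners are logged and the
+	 * mbuf is dropped.
+	 */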
+ struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data; + struct iwch_cq *chp; + struct iwch_qp *qhp; + u32 cqid = RSPQ_CQID(rsp_msg); + + rnicp = (struct iwch_dev *) rdev_p->ulp; + mtx_lock(&rnicp->lock); + chp = get_chp(rnicp, cqid); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + if (!chp || !qhp) { + log(LOG_ERR,"BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n", + cqid, CQE_QPID(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), + CQE_WRID_LOW(rsp_msg->cqe)); + mtx_unlock(&rnicp->lock); + goto out; + } + iwch_qp_add_ref(&qhp->ibqp); + mtx_lock(&chp->lock); + ++chp->refcnt; + mtx_unlock(&chp->lock); + mtx_unlock(&rnicp->lock); + + /* + * 1) completion of our sending a TERMINATE. + * 2) incoming TERMINATE message. + */ + if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) && + (CQE_STATUS(rsp_msg->cqe) == 0)) { + if (SQ_TYPE(rsp_msg->cqe)) { + CTR3(KTR_IW_CXGB, "%s QPID 0x%x ep %p disconnecting", + __FUNCTION__, qhp->wq.qpid, qhp->ep); + iwch_ep_disconnect(qhp->ep, 0, M_NOWAIT); + } else { + CTR2(KTR_IW_CXGB, "%s post REQ_ERR AE QPID 0x%x", __FUNCTION__, + qhp->wq.qpid); + post_qp_event(rnicp, qhp, chp, rsp_msg, + IB_EVENT_QP_REQ_ERR, 0); + iwch_ep_disconnect(qhp->ep, 0, M_NOWAIT); + } + goto done; + } + + /* Bad incoming Read request */ + if (SQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) { + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + /* Bad incoming write */ + if (RQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) { + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + switch (CQE_STATUS(rsp_msg->cqe)) { + + /* Completion Events */ + case TPT_ERR_SUCCESS: +#if 0 + /* + * Confirm the destination entry if this is a RECV completion. 
+ */ + if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) + dst_confirm(qhp->ep->dst); +#endif + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + break; + + case TPT_ERR_STAG: + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + case TPT_ERR_WRAP: + case TPT_ERR_BOUND: + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + log(LOG_ERR, "%s - CQE Err qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); + break; + + /* Device Fatal Errors */ + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1); + break; + + /* QP Fatal Errors */ + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_PBL_ADDR_BOUND: + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_OPCODE: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_RQE_ADDR_BOUND: + case TPT_ERR_IRD_OVERFLOW: + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + + default: + log(LOG_ERR,"Unknown T3 status 0x%x QPID 0x%x\n", + CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid); + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + } +done: + mtx_lock(&chp->lock); + if (--chp->refcnt == 0) + wakeup(chp); + mtx_unlock(&chp->lock); + iwch_qp_rem_ref(&qhp->ibqp); +out: + m_free(m); +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c new file mode 100644 index 0000000000000..0309b53ba3c03 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c @@ -0,0 +1,1418 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#endif + +static TAILQ_HEAD( ,cxio_rdev) rdev_list; +static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL; + +static struct cxio_rdev * +cxio_hal_find_rdev_by_name(char *dev_name) +{ + struct cxio_rdev *rdev; + + TAILQ_FOREACH(rdev, &rdev_list, entry) + if (!strcmp(rdev->dev_name, dev_name)) + return rdev; + return NULL; +} + +struct cxio_rdev * +cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev) +{ + struct cxio_rdev *rdev; + + TAILQ_FOREACH(rdev, &rdev_list, entry) + if (rdev->t3cdev_p == tdev) + return rdev; + return NULL; +} + +int +cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit) +{ + int ret; + struct t3_cqe *cqe; + u32 rptr; + + struct rdma_cq_op setup; + setup.id = cq->cqid; + setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0; + setup.op = op; + ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup); + + if ((ret < 0) || (op == CQ_CREDIT_UPDATE)) + return (ret); + + /* + * If the rearm returned an index other than our current index, + * then there might be CQE's in flight (being DMA'd). We must wait + * here for them to complete or the consumer can miss a notification. + */ + if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) { + int i=0; + + rptr = cq->rptr; + + /* + * Keep the generation correct by bumping rptr until it + * matches the index returned by the rearm - 1. + */ + while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret) + rptr++; + + /* + * Now rptr is the index for the (last) cqe that was + * in-flight at the time the HW rearmed the CQ. We + * spin until that CQE is valid. 
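+	 * The hardware may still be DMA'ing that entry, so poll it with a
+	 * 1us DELAY() per iteration and give up with -EIO ("stalled rnic")
+	 * after roughly one second.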
+ */ + cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2); + while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) { + DELAY(1); + if (i++ > 1000000) { + PANIC_IF(1); + log(LOG_ERR, "%s: stalled rnic\n", + rdev_p->dev_name); + return (-EIO); + } + } + + return 1; + } + + return 0; +} + +static int +cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid) +{ + struct rdma_cq_setup setup; + setup.id = cqid; + setup.base_addr = 0; /* NULL address */ + setup.size = 0; /* disaable the CQ */ + setup.credits = 0; + setup.credit_thres = 0; + setup.ovfl_mode = 0; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int +cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) +{ + u64 sge_cmd; + struct t3_modify_qp_wr *wqe; + struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT); + if (m == NULL) { + CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__); + return (-ENOMEM); + } + wqe = mtod(m, struct t3_modify_qp_wr *); + m->m_len = m->m_pkthdr.len = sizeof(*wqe); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, qpid, 7); + wqe->flags = htobe32(MODQP_WRITE_EC); + sge_cmd = qpid << 8 | 3; + wqe->sge_cmd = htobe64(sge_cmd); + m_set_priority(m, CPL_PRIORITY_CONTROL); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); +} + +int +cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + + cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); + if (!cq->cqid) + return (-ENOMEM); + cq->sw_queue = malloc(size, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!cq->sw_queue) + return (-ENOMEM); +#if 0 + cq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev, + (1UL << (cq->size_log2)) * + sizeof(struct t3_cqe), + &(cq->dma_addr), M_NOWAIT); +#else + cq->queue = contigmalloc((1UL << (cq->size_log2))*sizeof(struct t3_cqe), + M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); + if (cq->queue) + cq->dma_addr = vtophys(cq->queue); + else { + free(cq->sw_queue, M_DEVBUF); + return (-ENOMEM); + } +#endif + +#ifdef notyet + pci_unmap_addr_set(cq, mapping, cq->dma_addr); +#endif + memset(cq->queue, 0, size); + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = 65535; + setup.credit_thres = 1; + if (rdev_p->t3cdev_p->type != T3A) + setup.ovfl_mode = 0; + else + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +int +cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = setup.size; + setup.credit_thres = setup.size; /* TBD: overflow recovery */ + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static u32 +get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid *entry; + u32 qpid; + int i; + + mtx_lock(&uctx->lock); + if (!TAILQ_EMPTY(&uctx->qpids)) { + + entry = TAILQ_FIRST(&uctx->qpids); + TAILQ_REMOVE(&uctx->qpids, entry, entry); + qpid = entry->qpid; + free(entry, M_DEVBUF); + } else { + qpid = cxio_hal_get_qpid(rdev_p->rscp); + if (!qpid) + goto out; + for (i = qpid+1; i & rdev_p->qpmask; i++) { + entry = malloc(sizeof *entry, M_DEVBUF, M_NOWAIT); + if (!entry) + break; + entry->qpid = i; + TAILQ_INSERT_TAIL(&uctx->qpids, entry, entry); + } + } +out: + mtx_unlock(&uctx->lock); + CTR2(KTR_IW_CXGB, 
"%s qpid 0x%x", __FUNCTION__, qpid); + return qpid; +} + +static void +put_qpid(struct cxio_rdev *rdev_p, u32 qpid, + struct cxio_ucontext *uctx) +{ + struct cxio_qpid *entry; + + entry = malloc(sizeof *entry, M_DEVBUF, M_NOWAIT); + CTR2(KTR_IW_CXGB, "%s qpid 0x%x", __FUNCTION__, qpid); + entry->qpid = qpid; + mtx_lock(&uctx->lock); + TAILQ_INSERT_TAIL(&uctx->qpids, entry, entry); + mtx_unlock(&uctx->lock); +} + +void +cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid *pos, *tmp; + + mtx_lock(&uctx->lock); + TAILQ_FOREACH_SAFE(pos, &uctx->qpids, entry, tmp) { + TAILQ_REMOVE(&uctx->qpids, pos, entry); + if (!(pos->qpid & rdev_p->qpmask)) + cxio_hal_put_qpid(rdev_p->rscp, pos->qpid); + free(pos, M_DEVBUF); + } + mtx_unlock(&uctx->lock); +} + +void +cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + TAILQ_INIT(&uctx->qpids); + mtx_init(&uctx->lock, "cxio uctx", NULL, MTX_DEF|MTX_DUPOK); +} + +int +cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, + struct t3_wq *wq, struct cxio_ucontext *uctx) +{ + int depth = 1UL << wq->size_log2; + int rqsize = 1UL << wq->rq_size_log2; + + wq->qpid = get_qpid(rdev_p, uctx); + if (!wq->qpid) + return (-ENOMEM); + + wq->rq = malloc(depth * sizeof(u64), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!wq->rq) + goto err1; + + wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize); + if (!wq->rq_addr) + goto err2; + + wq->sq = malloc(depth * sizeof(struct t3_swsq), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!wq->sq) + goto err3; +#if 0 + wq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev, + depth * sizeof(union t3_wr), + &(wq->dma_addr), M_NOWAIT); +#else + wq->queue = contigmalloc(depth *sizeof(union t3_wr), + M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); + if (wq->queue) + wq->dma_addr = vtophys(wq->queue); + +#endif + if (!wq->queue) + goto err4; + + memset(wq->queue, 0, depth * sizeof(union t3_wr)); +#ifdef notyet + pci_unmap_addr_set(wq, mapping, wq->dma_addr); +#endif + wq->doorbell = rdev_p->rnic_info.kdb_addr; + if (!kernel_domain) + wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + + (wq->qpid << rdev_p->qpshift); + CTR4(KTR_IW_CXGB, "%s qpid 0x%x doorbell 0x%p udb 0x%llx", __FUNCTION__, + wq->qpid, wq->doorbell, (unsigned long long) wq->udb); + return 0; +err4: + free(wq->sq, M_DEVBUF); +err3: + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize); +err2: + free(wq->rq, M_DEVBUF); +err1: + put_qpid(rdev_p, wq->qpid, uctx); + return (-ENOMEM); +} + +int +cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + int err; + err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + free(cq->sw_queue, M_DEVBUF); +#if 0 + dma_free_coherent(&(rdev_p->rnic_info.pdev), + (1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), cq->queue, + /* pci_unmap_addr(cq, mapping)*/ 0); +#else + contigfree(cq->queue,(1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), M_DEVBUF); +#endif + cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); + return err; +} + +int +cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, + struct cxio_ucontext *uctx) +{ + +#if 0 + dma_free_coherent(&(rdev_p->rnic_info.pdev), + (1UL << (wq->size_log2)) + * sizeof(union t3_wr), wq->queue, + /* pci_unmap_addr(wq, mapping)*/ 0); +#else + contigfree(wq->queue, (1UL << (wq->size_log2)) + * sizeof(union t3_wr), M_DEVBUF); +#endif + free(wq->sq, M_DEVBUF); + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2)); + free(wq->rq, M_DEVBUF); + put_qpid(rdev_p, wq->qpid, uctx); + return 0; +} + +static void +insert_recv_cqe(struct t3_wq *wq, 
struct t3_cq *cq) +{ + struct t3_cqe cqe; + + CTR5(KTR_IW_CXGB, "%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = htobe32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(T3_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void +cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + u32 ptr; + + CTR3(KTR_IW_CXGB, "%s wq %p cq %p", __FUNCTION__, wq, cq); + + /* flush RQ */ + CTR4(KTR_IW_CXGB, "%s rq_rptr %u rq_wptr %u skip count %u", __FUNCTION__, + wq->rq_rptr, wq->rq_wptr, count); + ptr = wq->rq_rptr + count; + while (ptr++ != wq->rq_wptr) + insert_recv_cqe(wq, cq); +} + +static void +insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, + struct t3_swsq *sqp) +{ + struct t3_cqe cqe; + + CTR5(KTR_IW_CXGB, "%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = htobe32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(sqp->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + cqe.u.scqe.wrid_hi = sqp->sq_wptr; + + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void +cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + __u32 ptr; + struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); + + ptr = wq->sq_rptr + count; + sqp += count; + while (ptr != wq->sq_wptr) { + insert_sq_cqe(wq, cq, sqp); + sqp++; + ptr++; + } +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. 
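+ * Each copied entry is tagged with V_CQE_SWCQE(1) so it is subsequently
+ * treated as a software CQE.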
+ */ +void +cxio_flush_hw_cq(struct t3_cq *cq) +{ + struct t3_cqe *cqe, *swcqe; + + CTR3(KTR_IW_CXGB, "%s cq %p cqid 0x%x", __FUNCTION__, cq, cq->cqid); + cqe = cxio_next_hw_cqe(cq); + while (cqe) { + CTR3(KTR_IW_CXGB, "%s flushing hwcq rptr 0x%x to swcq wptr 0x%x", + __FUNCTION__, cq->rptr, cq->sw_wptr); + swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2); + *swcqe = *cqe; + swcqe->header |= htobe32(V_CQE_SWCQE(1)); + cq->sw_wptr++; + cq->rptr++; + cqe = cxio_next_hw_cqe(cq); + } +} + +static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) +{ + if (CQE_OPCODE(*cqe) == T3_TERMINATE) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) + return 0; + + return 1; +} + +void +cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if ((SQ_TYPE(*cqe) || (CQE_OPCODE(*cqe) == T3_READ_RESP)) && + (CQE_QPID(*cqe) == wq->qpid)) + (*count)++; + ptr++; + } + CTR3(KTR_IW_CXGB, "%s cq %p count %d", __FUNCTION__, cq, *count); +} + +void +cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + CTR2(KTR_IW_CXGB, "%s count zero %d", __FUNCTION__, *count); + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) && + (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq)) + (*count)++; + ptr++; + } + CTR3(KTR_IW_CXGB, "%s cq %p count %d", __FUNCTION__, cq, *count); +} + +static int +cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) +{ + struct rdma_cq_setup setup; + setup.id = 0; + setup.base_addr = 0; /* NULL address */ + setup.size = 1; /* enable the CQ */ + setup.credits = 0; + + /* force SGE to redirect to RspQ and interrupt */ + setup.credit_thres = 0; + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int +cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) +{ + int err; + u64 sge_cmd, ctx0, ctx1; + u64 base_addr; + struct t3_modify_qp_wr *wqe; + struct mbuf *m; + + m = m_gethdr(MT_DATA, M_NOWAIT); + if (m == NULL) { + CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__); + return (-ENOMEM); + } + err = cxio_hal_init_ctrl_cq(rdev_p); + if (err) { + CTR2(KTR_IW_CXGB, "%s err %d initializing ctrl_cq", __FUNCTION__, err); + goto err; + } +#if 0 + rdev_p->ctrl_qp.workq = dma_alloc_coherent( + rdev_p->rnic_info.pdev, + (1 << T3_CTRL_QP_SIZE_LOG2) * + sizeof(union t3_wr), + &(rdev_p->ctrl_qp.dma_addr), + M_NOWAIT); +#else + rdev_p->ctrl_qp.workq = contigmalloc((1 << T3_CTRL_QP_SIZE_LOG2) + *sizeof(union t3_wr), M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); + if (rdev_p->ctrl_qp.workq) + rdev_p->ctrl_qp.dma_addr = vtophys(rdev_p->ctrl_qp.workq); + +#endif + + if (!rdev_p->ctrl_qp.workq) { + CTR1(KTR_IW_CXGB, "%s dma_alloc_coherent failed", __FUNCTION__); + err = -ENOMEM; + goto err; + } +#if 0 + pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping, + rdev_p->ctrl_qp.dma_addr); +#endif + rdev_p->ctrl_qp.doorbell = (void /*__iomem */ *)rdev_p->rnic_info.kdb_addr; + memset(rdev_p->ctrl_qp.workq, 0, + (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); + + mtx_init(&rdev_p->ctrl_qp.lock, "ctl-qp 
lock", NULL, MTX_DEF|MTX_DUPOK); + + /* update HW Ctrl QP context */ + base_addr = rdev_p->ctrl_qp.dma_addr; + base_addr >>= 12; + ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) | + V_EC_BASE_LO((u32) base_addr & 0xffff)); + ctx0 <<= 32; + ctx0 |= V_EC_CREDITS(FW_WR_NUM); + base_addr >>= 16; + ctx1 = (u32) base_addr; + base_addr >>= 32; + ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | + V_EC_TYPE(0) | V_EC_GEN(1) | + V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; + wqe = mtod(m, struct t3_modify_qp_wr *); + m->m_len = m->m_pkthdr.len = sizeof(*wqe); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, + T3_CTL_QP_TID, 7); + wqe->flags = htobe32(MODQP_WRITE_EC); + sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; + wqe->sge_cmd = htobe64(sge_cmd); + wqe->ctx1 = htobe64(ctx1); + wqe->ctx0 = htobe64(ctx0); + CTR3(KTR_IW_CXGB, "CtrlQP dma_addr 0x%llx workq %p size %d", + (unsigned long long) rdev_p->ctrl_qp.dma_addr, + rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + m_set_priority(m, CPL_PRIORITY_CONTROL); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); +err: + m_free(m); + return err; +} + +static int +cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) +{ +#if 0 + + dma_free_coherent(&(rdev_p->rnic_info.pdev), + (1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), rdev_p->ctrl_qp.workq, + /* pci_unmap_addr(&rdev_p->ctrl_qp, mapping)*/ 0); +#else + contigfree(rdev_p->ctrl_qp.workq,(1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), M_DEVBUF); +#endif + return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID); +} + +/* write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + * caller aquires the ctrl_qp lock before the call + */ +static int +cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, + u32 len, void *data, int completion) +{ + u32 i, nr_wqe, copy_len; + u8 *copy_data; + u8 wr_len, utx_len; /* lenght in 8 byte flit */ + enum t3_wr_flags flag; + __be64 *wqe; + u64 utx_cmd; + addr &= 0x7FFFFFF; + nr_wqe = len % 96 ? len / 96 + 1 : len / 96; /* 96B max per WQE */ + CTR6(KTR_IW_CXGB, "cxio_hal_ctrl_qp_write_mem wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x", + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len, + nr_wqe, data, addr); + utx_len = 3; /* in 32B unit */ + for (i = 0; i < nr_wqe; i++) { + if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2)) { + CTR4(KTR_IW_CXGB, "%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, " + "wait for more space i %d", __FUNCTION__, + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i); + if (cxio_wait(&rdev_p->ctrl_qp, + &rdev_p->ctrl_qp.lock, + !Q_FULL(rdev_p->ctrl_qp.rptr, + rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2))) { + CTR1(KTR_IW_CXGB, "%s ctrl_qp workq interrupted", + __FUNCTION__); + return (-ERESTART); + } + CTR2(KTR_IW_CXGB, "%s ctrl_qp wakeup, continue posting work request " + "i %d", __FUNCTION__, i); + } + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + flag = 0; + if (i == (nr_wqe - 1)) { + /* last WQE */ + flag = completion ? 
T3_COMPLETION_FLAG : 0; + if (len % 32) + utx_len = len / 32 + 1; + else + utx_len = len / 32; + } + + /* + * Force a CQE to return the credit to the workq in case + * we posted more than half the max QP size of WRs + */ + if ((i != 0) && + (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) { + flag = T3_COMPLETION_FLAG; + CTR2(KTR_IW_CXGB, "%s force completion at i %d", __FUNCTION__, i); + } + + /* build the utx mem command */ + wqe += (sizeof(struct t3_bypass_wr) >> 3); + utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3); + utx_cmd <<= 32; + utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1); + *wqe = htobe64(utx_cmd); + wqe++; + copy_data = (u8 *) data + i * 96; + copy_len = len > 96 ? 96 : len; + + /* clear memory content if data is NULL */ + if (data) + memcpy(wqe, copy_data, copy_len); + else + memset(wqe, 0, copy_len); + if (copy_len % 32) + memset(((u8 *) wqe) + copy_len, 0, + 32 - (copy_len % 32)); + wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 + + (utx_len << 2); + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + + /* wptr in the WRID[31:0] */ + ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr; + + /* + * This must be the last write with a memory barrier + * for the genbit + */ + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag, + Q_GENBIT(rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID, + wr_len); + if (flag == T3_COMPLETION_FLAG) + ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID); + + len -= 96; + rdev_p->ctrl_qp.wptr++; + } + return 0; +} + +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size + * OUT: stag index, actual pbl_size, pbl_addr allocated. + * TBD: shared memory region support + */ +static int +__cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum tpt_mem_type type, enum tpt_mem_perm perm, + u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, + u32 *pbl_size, u32 *pbl_addr) +{ + int err; + struct tpt_entry tpt; + u32 stag_idx; + u32 wptr; + int rereg = (*stag != T3_STAG_UNSET); + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) { + stag_idx = cxio_hal_get_stag(rdev_p->rscp); + if (!stag_idx) + return (-ENOMEM); + *stag = (stag_idx << 8) | ((*stag) & 0xFF); + } + CTR5(KTR_IW_CXGB, "%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x", + __FUNCTION__, stag_state, type, pdid, stag_idx); + + if (reset_tpt_entry) + cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); + else if (!rereg) { + *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); + if (!*pbl_addr) { + return (-ENOMEM); + } + } + + mtx_lock(&rdev_p->ctrl_qp.lock); + + /* write PBL first if any - update pbl only if pbl list exist */ + if (pbl) { + + CTR4(KTR_IW_CXGB, "%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d", + __FUNCTION__, *pbl_addr, rdev_p->rnic_info.pbl_base, + *pbl_size); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + (*pbl_addr >> 5), + (*pbl_size << 3), pbl, 0); + if (err) + goto ret; + } + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_stag_pdid = htobe32(F_TPT_VALID | + V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) | + V_TPT_STAG_STATE(stag_state) | + V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid)); + PANIC_IF(page_size >= 28); + tpt.flags_pagesize_qpid = htobe32(V_TPT_PERM(perm) | + F_TPT_MW_BIND_ENABLE | + V_TPT_ADDR_TYPE((zbva ? 
TPT_ZBTO : TPT_VATO)) | + V_TPT_PAGE_SIZE(page_size)); + tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : + htobe32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + tpt.len = htobe32(len); + tpt.va_hi = htobe32((u32) (to >> 32)); + tpt.va_low_or_fbo = htobe32((u32) (to & 0xFFFFFFFFULL)); + tpt.rsvd_bind_cnt_or_pstag = 0; + tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : + htobe32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + } + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + stag_idx + + (rdev_p->rnic_info.tpt_base >> 5), + sizeof(tpt), &tpt, 1); + + /* release the stag index to free pool */ + if (reset_tpt_entry) + cxio_hal_put_stag(rdev_p->rscp, stag_idx); +ret: + wptr = rdev_p->ctrl_qp.wptr; + mtx_unlock(&rdev_p->ctrl_qp.lock); + if (!err) + if (cxio_wait(&rdev_p->ctrl_qp, + &rdev_p->ctrl_qp.lock, + SEQ32_GE(rdev_p->ctrl_qp.rptr, wptr))) + return (-ERESTART); + return err; +} + +int +cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int +cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int +cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + &pbl_size, &pbl_addr); +} + +int +cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) +{ + u32 pbl_size = 0; + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, + NULL, &pbl_size, NULL); +} + +int +cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + NULL, NULL); +} + +int +cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +{ + struct t3_rdma_init_wr *wqe; + struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT); + if (m == NULL) + return (-ENOMEM); + CTR2(KTR_IW_CXGB, "%s rdev_p %p", __FUNCTION__, rdev_p); + wqe = mtod(m, struct t3_rdma_init_wr *); + m->m_len = m->m_pkthdr.len = sizeof(*wqe); + wqe->wrh.op_seop_flags = htobe32(V_FW_RIWR_OP(T3_WR_INIT)); + wqe->wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(attr->tid) | + V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); + wqe->wrid.id1 = 0; + wqe->qpid = htobe32(attr->qpid); + wqe->pdid = htobe32(attr->pdid); + wqe->scqid = htobe32(attr->scqid); + wqe->rcqid = htobe32(attr->rcqid); + wqe->rq_addr = htobe32(attr->rq_addr - rdev_p->rnic_info.rqt_base); + wqe->rq_size = htobe32(attr->rq_size); + wqe->mpaattrs = attr->mpaattrs; + wqe->qpcaps = attr->qpcaps; + wqe->ulpdu_size = htobe16(attr->tcp_emss); + wqe->flags = htobe32(attr->flags); + wqe->ord = htobe32(attr->ord); + wqe->ird = htobe32(attr->ird); + wqe->qp_dma_addr = htobe64(attr->qp_dma_addr); + wqe->qp_dma_size = htobe32(attr->qp_dma_size); + wqe->irs = htobe32(attr->irs); + m_set_priority(m, 0); /* 0=>ToeQ; 1=>CtrlQ */ + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); +} + +void +cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = ev_cb; +} + +void +cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t 
ev_cb) +{ + cxio_ev_cb = NULL; +} + +static int +cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct mbuf *m) +{ + static int cnt; + struct cxio_rdev *rdev_p = NULL; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data; + + CTR6(KTR_IW_CXGB, "%s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x", + __FUNCTION__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), + RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg)); + CTR4(KTR_IW_CXGB, "se %0x notify %0x cqbranch %0x creditth %0x", + RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), + RSPQ_CREDIT_THRESH(rsp_msg)); + CTR4(KTR_IW_CXGB, "CQE: QPID 0x%0x type 0x%0x status 0x%0x opcode %d", + CQE_QPID(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe)); + CTR3(KTR_IW_CXGB, "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x", + CQE_LEN(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; + if (!rdev_p) { + CTR2(KTR_IW_CXGB, "%s called by t3cdev %p with null ulp", __FUNCTION__, + t3cdev_p); + return 0; + } + if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + mtx_lock(&rdev_p->ctrl_qp.lock); + rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; + wakeup(&rdev_p->ctrl_qp); + mtx_unlock(&rdev_p->ctrl_qp.lock); + m_free(m); + } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) + m_free(m); + else if (cxio_ev_cb) + (*cxio_ev_cb) (rdev_p, m); + else + m_free(m); + cnt++; + return 0; +} + +/* Caller takes care of locking if needed */ +int +cxio_rdev_open(struct cxio_rdev *rdev_p) +{ + struct ifnet *ifp; + int err = 0; + + if (strlen(rdev_p->dev_name)) { + if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { + return (-EBUSY); + } + ifp = rdev_p->ifp; + if (ifp == NULL) + return (-EINVAL); + if_free(ifp); + } else if (rdev_p->t3cdev_p) { + if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) + return (-EBUSY); + ifp = rdev_p->t3cdev_p->lldev; + strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, + T3_MAX_DEV_NAME_LEN); + } else { + CTR1(KTR_IW_CXGB, "%s t3cdev_p or dev_name must be set", __FUNCTION__); + return (-EINVAL); + } + + TAILQ_INSERT_TAIL(&rdev_list, rdev_p, entry); + + CTR2(KTR_IW_CXGB, "%s opening rnic dev %s", __FUNCTION__, rdev_p->dev_name); + memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); + if (!rdev_p->t3cdev_p) + rdev_p->t3cdev_p = T3CDEV(ifp); + rdev_p->t3cdev_p->ulp = (void *) rdev_p; + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, + &(rdev_p->rnic_info)); + if (err) { + log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, + &(rdev_p->port_info)); + if (err) { + log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. 
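+	 * Illustrative arithmetic (assuming 4KB pages and a 64MB user
+	 * doorbell region): udbell_len >> PAGE_SHIFT = 16384, so
+	 * 65536 >> ilog2(16384) = 4 and qpshift = 12 - ilog2(4) = 10;
+	 * a user QP doorbell then lives at udbell_physbase +
+	 * (qpid << 10), four doorbell windows per page.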
+ */ + cxio_init_ucontext(rdev_p, &rdev_p->uctx); + rdev_p->qpshift = PAGE_SHIFT - + ilog2(65536 >> + ilog2(rdev_p->rnic_info.udbell_len >> + PAGE_SHIFT)); + rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; + rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; + CTR4(KTR_IW_CXGB, "cxio_rdev_open rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d", + rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p)); + CTR4(KTR_IW_CXGB, "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x", + rdev_p->rnic_info.pbl_base, + rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base, + rdev_p->rnic_info.rqt_top); + CTR6(KTR_IW_CXGB, "udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu " + "qpnr %d qpmask 0x%x", + rdev_p->rnic_info.udbell_len, + rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr, + rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask); + + err = cxio_hal_init_ctrl_qp(rdev_p); + if (err) { + log(LOG_ERR, "%s error %d initializing ctrl_qp.\n", + __FUNCTION__, err); + goto err1; + } + err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0, + 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ, + T3_MAX_NUM_PD); + if (err) { + log(LOG_ERR, "%s error %d initializing hal resources.\n", + __FUNCTION__, err); + goto err2; + } + err = cxio_hal_pblpool_create(rdev_p); + if (err) { + log(LOG_ERR, "%s error %d initializing pbl mem pool.\n", + __FUNCTION__, err); + goto err3; + } + err = cxio_hal_rqtpool_create(rdev_p); + if (err) { + log(LOG_ERR, "%s error %d initializing rqt mem pool.\n", + __FUNCTION__, err); + goto err4; + } + return 0; +err4: + cxio_hal_pblpool_destroy(rdev_p); +err3: + cxio_hal_destroy_resource(rdev_p->rscp); +err2: + cxio_hal_destroy_ctrl_qp(rdev_p); +err1: + TAILQ_REMOVE(&rdev_list, rdev_p, entry); + return err; +} + +void +cxio_rdev_close(struct cxio_rdev *rdev_p) +{ + if (rdev_p) { + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + TAILQ_REMOVE(&rdev_list, rdev_p, entry); + rdev_p->t3cdev_p->ulp = NULL; + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); + } +} + +int +cxio_hal_init(void) +{ + TAILQ_INIT(&rdev_list); +#ifdef needed + if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) + return (-ENOMEM); +#endif + t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + return 0; +} + +void +cxio_hal_exit(void) +{ + struct cxio_rdev *rdev, *tmp; + + t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); + TAILQ_FOREACH_SAFE(rdev, &rdev_list, entry, tmp) + cxio_rdev_close(rdev); +#ifdef needed + cxio_hal_destroy_rhdl_resource(); +#endif +} + +static void +flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_swsq *sqp; + __u32 ptr = wq->sq_rptr; + int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr); + + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (count--) + if (!sqp->signaled) { + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + } else if (sqp->complete) { + + /* + * Insert this completed cqe into the swcq. 
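+ * The CQE was stashed here by cxio_poll_cq() when it completed out of
+ * order; once the preceding unsignaled entries have been walked it is
+ * now in-order and can be moved into the software CQ for the consumer.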
+ */ + CTR3(KTR_IW_CXGB, "%s moving cqe into swcq sq idx %ld cq idx %ld", + __FUNCTION__, Q_PTR2IDX(ptr, wq->sq_size_log2), + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)); + sqp->cqe.header |= htonl(V_CQE_SWCQE(1)); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) + = sqp->cqe; + cq->sw_wptr++; + sqp->signaled = 0; + break; + } else + break; +} + +static void +create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe, + struct t3_cqe *read_cqe) +{ + read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr; + read_cqe->len = wq->oldest_read->read_len; + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) | + V_CQE_SWCQE(SW_CQE(*hw_cqe)) | + V_CQE_OPCODE(T3_READ_REQ) | + V_CQE_TYPE(1)); +} + +/* + * Return a ptr to the next read wr in the SWSQ or NULL. + */ +static void +advance_oldest_read(struct t3_wq *wq) +{ + + u32 rptr = wq->oldest_read - wq->sq + 1; + u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2); + + while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) { + wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2); + + if (wq->oldest_read->opcode == T3_READ_REQ) + return; + rptr++; + } + wq->oldest_read = NULL; +} + +/* + * cxio_poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned, + * -1 CQE skipped, try again. + */ +int +cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t3_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + hw_cqe = cxio_next_cqe(cq); + + CTR5(KTR_IW_CXGB, "cxio_poll_cq CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x", + CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe), + CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe)); + CTR4(KTR_IW_CXGB, "opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x", + CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe), + CQE_WRID_LOW(*hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -1; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + /* + * T3A: Discard TERMINATE CQEs. + */ + if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) { + ret = -1; + wq->error = 1; + goto skip_cqe; + } + + if (CQE_STATUS(*hw_cqe) || wq->error) { + *cqe_flushed = wq->error; + wq->error = 1; + + /* + * T3A inserts errors into the CQE. We cannot return + * these as work completions. + */ + /* incoming write failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE) + && RQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + /* incoming read request failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + + /* incoming SEND with no receive posted failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + ret = -1; + goto skip_cqe; + } + goto proc_cqe; + } + + /* + * RECV completion. 
+ */ + if (RQ_TYPE(*hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with TPT_ERR_MSN and mark the wq in + * error. + */ + if (__predict_false((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { + wq->error = 1; + hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) { + struct t3_swsq *sqp; + + CTR2(KTR_IW_CXGB, "%s out of order completion going in swsq at idx %ld", + __FUNCTION__, + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2)); + sqp = wq->sq + + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2); + sqp->cqe = *hw_cqe; + sqp->complete = 1; + ret = -1; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(*hw_cqe)) { + wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); + CTR2(KTR_IW_CXGB, "%s completing sq idx %ld", __FUNCTION__, + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); + *cookie = (wq->sq + + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id; + wq->sq_rptr++; + } else { + CTR2(KTR_IW_CXGB, "%s completing rq idx %ld", __FUNCTION__, + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + wq->rq_rptr++; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(*hw_cqe)) { + CTR4(KTR_IW_CXGB, "%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x", + __FUNCTION__, cq, cq->cqid, cq->sw_rptr); + ++cq->sw_rptr; + } else { + CTR4(KTR_IW_CXGB, "%s cq %p cqid 0x%x skip hw cqe rptr 0x%x", + __FUNCTION__, cq, cq->cqid, cq->rptr); + ++cq->rptr; + + /* + * T3A: compute credits. + */ + if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1))) + || ((cq->rptr - cq->wptr) >= 128)) { + *credit = cq->rptr - cq->wptr; + cq->wptr = cq->rptr; + } + } + return ret; +} + + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h new file mode 100644 index 0000000000000..6a401e09322d7 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h @@ -0,0 +1,330 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef __CXIO_HAL_H__ +#define __CXIO_HAL_H__ +#include <sys/condvar.h> +#include <sys/ktr.h> + +#define T3_CTRL_QP_ID FW_RI_SGEEC_START +#define T3_CTL_QP_TID FW_RI_TID_START +#define T3_CTRL_QP_SIZE_LOG2 8 +#define T3_CTRL_CQ_ID 0 + +/* TBD */ +#define T3_MAX_NUM_RI (1<<15) +#define T3_MAX_NUM_QP (1<<15) +#define T3_MAX_NUM_CQ (1<<15) +#define T3_MAX_NUM_PD (1<<15) +#define T3_MAX_PBL_SIZE 256 +#define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_NUM_STAG (1<<15) + +#define T3_STAG_UNSET 0xffffffff + +#define T3_MAX_DEV_NAME_LEN 32 + +struct cxio_hal_ctrl_qp { + u32 wptr; + u32 rptr; + struct mtx lock; /* for the wtpr, can sleep */ +#ifdef notyet + DECLARE_PCI_UNMAP_ADDR(mapping) +#endif + union t3_wr *workq; /* the work request queue */ + bus_addr_t dma_addr; /* pci bus address of the workq */ + void /* __iomem */ *doorbell; +}; + +struct cxio_hal_resource { + struct buf_ring *tpt_fifo; + struct mtx tpt_fifo_lock; + struct buf_ring *qpid_fifo; + struct mtx qpid_fifo_lock; + struct buf_ring *cqid_fifo; + struct mtx cqid_fifo_lock; + struct buf_ring *pdid_fifo; + struct mtx pdid_fifo_lock; +}; + +struct cxio_qpid { + TAILQ_ENTRY(cxio_qpid) entry; + u32 qpid; +}; + +struct cxio_ucontext { + TAILQ_HEAD(, cxio_qpid) qpids; + struct mtx lock; +}; + +struct cxio_rdev { + char dev_name[T3_MAX_DEV_NAME_LEN]; + struct t3cdev *t3cdev_p; + struct rdma_info rnic_info; + struct adap_ports port_info; + struct cxio_hal_resource *rscp; + struct cxio_hal_ctrl_qp ctrl_qp; + void *ulp; + unsigned long qpshift; + u32 qpnr; + u32 qpmask; + struct cxio_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct ifnet *ifp; + TAILQ_ENTRY(cxio_rdev) entry; +}; + +static __inline int +cxio_num_stags(struct cxio_rdev *rdev_p) +{ + return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); +} + +typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p, + struct mbuf * m); + +#define RSPQ_CQID(rsp) (be32toh(rsp->cq_ptrid) & 0xffff) +#define RSPQ_CQPTR(rsp) ((be32toh(rsp->cq_ptrid) >> 16) & 0xffff) +#define RSPQ_GENBIT(rsp) ((be32toh(rsp->flags) >> 16) & 1) +#define RSPQ_OVERFLOW(rsp) ((be32toh(rsp->flags) >> 17) & 1) +#define RSPQ_AN(rsp) ((be32toh(rsp->flags) >> 18) & 1) +#define RSPQ_SE(rsp) ((be32toh(rsp->flags) >> 19) & 1) +#define RSPQ_NOTIFY(rsp) ((be32toh(rsp->flags) >> 20) & 1) +#define RSPQ_CQBRANCH(rsp) ((be32toh(rsp->flags) >> 21) & 1) +#define RSPQ_CREDIT_THRESH(rsp) ((be32toh(rsp->flags) >> 22) & 1) + +struct respQ_msg_t { + __be32 flags; /* flit 0 */ + __be32 cq_ptrid; + __be64 rsvd; /* flit 1 */ + struct t3_cqe cqe; /* flits 2-3 */ +}; + +enum t3_cq_opcode { + CQ_ARM_AN = 0x2, + CQ_ARM_SE = 0x6, + CQ_FORCE_AN = 0x3, + CQ_CREDIT_UPDATE = 0x7 +}; + +int cxio_rdev_open(struct cxio_rdev *rdev); +void cxio_rdev_close(struct cxio_rdev *rdev); +int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, + enum t3_cq_opcode op, u32 
credit); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr); +int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); +int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); +int cxio_hal_init(void); +void cxio_hal_exit(void); +void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_flush_hw_cq(struct t3_cq *cq); +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit); + +#define MOD "iw_cxgb: " + +#ifdef DEBUG +void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag); +void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint32_t len, u8 shift); +void cxio_dump_wqe(union t3_wr *wqe); +void cxio_dump_wce(struct t3_cqe *wce); +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents); +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid); +#endif + + + static unsigned char hiBitSetTab[] = { + 0, 1, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7 + +}; + + +static __inline +int ilog2(unsigned long val) +{ + unsigned long tmp; + + tmp = val >> 24; + if (tmp) { + return hiBitSetTab[tmp] + 23; + } + tmp = (val >> 16) & 0xff; + if (tmp) { + return hiBitSetTab[tmp] + 15; + } + tmp = (val >> 8) & 0xff; + if (tmp) { + return hiBitSetTab[tmp] + 7; + + } + return hiBitSetTab[val & 0xff] - 1; +} + +#define cxfree(a) free((a), M_DEVBUF); +#define kmalloc(a, b) malloc((a), M_DEVBUF, (b)) +#define kzalloc(a, b) malloc((a), M_DEVBUF, (b)|M_ZERO) + +static __inline __attribute__((const)) +unsigned long roundup_pow_of_two(unsigned long n) +{ + return 1UL << flsl(n - 1); +} + +#define PAGE_ALIGN(x) 
roundup2((x), PAGE_SIZE) + +#include <sys/blist.h> +struct gen_pool { + blist_t gen_list; + daddr_t gen_base; + int gen_chunk_shift; + struct mtx gen_lock; +}; + +static __inline struct gen_pool * +gen_pool_create(daddr_t base, u_int chunk_shift, u_int len) +{ + struct gen_pool *gp; + + gp = malloc(sizeof(struct gen_pool), M_DEVBUF, M_NOWAIT); + if (gp == NULL) + return (NULL); + + gp->gen_list = blist_create(len >> chunk_shift, M_NOWAIT); + if (gp->gen_list == NULL) { + free(gp, M_DEVBUF); + return (NULL); + } + blist_free(gp->gen_list, 0, len >> chunk_shift); + gp->gen_base = base; + gp->gen_chunk_shift = chunk_shift; + mtx_init(&gp->gen_lock, "genpool", NULL, MTX_DUPOK|MTX_DEF); + + return (gp); +} + +static __inline unsigned long +gen_pool_alloc(struct gen_pool *gp, int size) +{ + int chunks; + daddr_t blkno; + + chunks = (size + (1<<gp->gen_chunk_shift) - 1) >> gp->gen_chunk_shift; + mtx_lock(&gp->gen_lock); + blkno = blist_alloc(gp->gen_list, chunks); + mtx_unlock(&gp->gen_lock); + + if (blkno == SWAPBLK_NONE) + return (0); + + return (gp->gen_base + ((1 << gp->gen_chunk_shift) * blkno)); +} + +static __inline void +gen_pool_free(struct gen_pool *gp, daddr_t address, int size) +{ + int chunks; + daddr_t blkno; + + chunks = (size + (1<<gp->gen_chunk_shift) - 1) >> gp->gen_chunk_shift; + blkno = (address - gp->gen_base) / (1 << gp->gen_chunk_shift); + mtx_lock(&gp->gen_lock); + blist_free(gp->gen_list, blkno, chunks); + mtx_unlock(&gp->gen_lock); +} + +static __inline void +gen_pool_destroy(struct gen_pool *gp) +{ + blist_destroy(gp->gen_list); + free(gp, M_DEVBUF); +} + +#define cxio_wait(ctx, lockp, cond) \ +({ \ + int __ret = 0; \ + mtx_lock(lockp); \ + while (!cond) { \ + msleep(ctx, lockp, 0, "cxio_wait", hz); \ + if (SIGPENDING(curthread)) { \ + __ret = ERESTART; \ + break; \ + } \ + } \ + mtx_unlock(lockp); \ + __ret; \ +}) +extern struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev); + +#define KTR_IW_CXGB KTR_SPARE4 + +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c new file mode 100644 index 0000000000000..df06f87a9805c --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c @@ -0,0 +1,219 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list) +{ + u32 stag; + u32 mmid; + + + if (cxio_register_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return (-ENOMEM); + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp); + return 0; +} + +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages) +{ + u32 stag; + u32 mmid; + + + /* We could support this... 
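+ * As written, a reregistration that needs more pages than the PBL
+ * allocated at the original registration is simply rejected.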
*/ + if (npages > mhp->attr.pbl_size) + return (-ENOMEM); + + stag = mhp->attr.stag; + if (cxio_reregister_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return (-ENOMEM); + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp); + return 0; +} + +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return (-EINVAL); + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return (-EINVAL); + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + else + mask |= buffer_list[i].addr & PAGE_MASK; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + else + mask |= (buffer_list[i].addr + buffer_list[i].size + + PAGE_SIZE - 1) & PAGE_MASK; + } + + if (*total_size > 0xFFFFFFFFULL) + return (-ENOMEM); + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if ((1ULL << *shift) & mask) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return (-EINVAL); + + *page_list = kmalloc(sizeof(u64) * *npages, M_NOWAIT); + if (!*page_list) + return (-ENOMEM); + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = htobe64(buffer_list[i].addr + + ((u64) j << *shift)); + + CTR6(KTR_IW_CXGB, "%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d", + __FUNCTION__, (unsigned long long) *iova_start, + (unsigned long long) mask, *shift, (unsigned long long) *total_size, + *npages); + + return 0; + +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c new file mode 100644 index 0000000000000..4ef7dc5f8c979 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c @@ -0,0 +1,1295 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> + +#include <netinet/in.h> + + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + + +static int +iwch_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + return (-ENOSYS); +} + +static struct ib_ah * +iwch_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int +iwch_ah_destroy(struct ib_ah *ah) +{ + return (-ENOSYS); +} + +static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return (-ENOSYS); +} + +static int +iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return (-ENOSYS); +} + +static int +iwch_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + return (-ENOSYS); +} + +static int +iwch_dealloc_ucontext(struct ib_ucontext *context) +{ + struct iwch_dev *rhp = to_iwch_dev(context->device); + struct iwch_ucontext *ucontext = to_iwch_ucontext(context); + struct iwch_mm_entry *mm, *tmp; + + CTR2(KTR_IW_CXGB, "%s context %p", __FUNCTION__, context); + TAILQ_FOREACH_SAFE(mm, &ucontext->mmaps, entry, tmp) { + TAILQ_REMOVE(&ucontext->mmaps, mm, entry); + cxfree(mm); + } + cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); + cxfree(ucontext); + return 0; +} + +static struct ib_ucontext * +iwch_alloc_ucontext(struct ib_device *ibdev, struct 
ib_udata *udata) +{ + struct iwch_ucontext *context; + struct iwch_dev *rhp = to_iwch_dev(ibdev); + + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + context = malloc(sizeof(*context), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!context) + return ERR_PTR(-ENOMEM); + cxio_init_ucontext(&rhp->rdev, &context->uctx); + TAILQ_INIT(&context->mmaps); + mtx_init(&context->mmap_lock, "ucontext mmap", NULL, MTX_DEF); + return &context->ibucontext; +} + +static int +iwch_destroy_cq(struct ib_cq *ib_cq) +{ + struct iwch_cq *chp; + + CTR2(KTR_IW_CXGB, "%s ib_cq %p", __FUNCTION__, ib_cq); + chp = to_iwch_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + mtx_lock(&chp->lock); + if (--chp->refcnt) + msleep(chp, &chp->lock, 0, "iwch_destroy_cq", 0); + mtx_unlock(&chp->lock); + + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + cxfree(chp); + return 0; +} + +static struct ib_cq * +iwch_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + struct iwch_create_cq_resp uresp; + struct iwch_create_cq_req ureq; + struct iwch_ucontext *ucontext = NULL; + + CTR3(KTR_IW_CXGB, "%s ib_dev %p entries %d", __FUNCTION__, ibdev, entries); + rhp = to_iwch_dev(ibdev); + chp = malloc(sizeof(*chp), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!chp) { + return ERR_PTR(-ENOMEM); + } + if (ib_context) { + ucontext = to_iwch_ucontext(ib_context); + if (!t3a_device(rhp)) { + if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + cxfree(chp); + return ERR_PTR(-EFAULT); + } + chp->user_rptr_addr = (u32 /*__user */*)(unsigned long)ureq.user_rptr_addr; + } + } + + if (t3a_device(rhp)) { + + /* + * T3A: Add some fluff to handle extra CQEs inserted + * for various errors. + * Additional CQE possibilities: + * TERMINATE, + * incoming RDMA WRITE Failures + * incoming RDMA READ REQUEST FAILUREs + * NOTE: We cannot ensure the CQ won't overflow. + */ + entries += 16; + } + entries = roundup_pow_of_two(entries); + chp->cq.size_log2 = ilog2(entries); + + if (cxio_create_cq(&rhp->rdev, &chp->cq)) { + cxfree(chp); + return ERR_PTR(-ENOMEM); + } + chp->rhp = rhp; + chp->ibcq.cqe = 1 << chp->cq.size_log2; + mtx_init(&chp->lock, "cxgb cq", NULL, MTX_DEF|MTX_DUPOK); + chp->refcnt = 1; + insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + + if (ucontext) { + struct iwch_mm_entry *mm; + + mm = kmalloc(sizeof *mm, M_NOWAIT); + if (!mm) { + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-ENOMEM); + } + uresp.cqid = chp->cq.cqid; + uresp.size_log2 = chp->cq.size_log2; + mtx_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + mtx_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + cxfree(mm); + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-EFAULT); + } + mm->key = uresp.key; + mm->addr = vtophys(chp->cq.queue); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof (struct t3_cqe)); + insert_mmap(ucontext, mm); + } + CTR4(KTR_IW_CXGB, "created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx", + chp->cq.cqid, chp, (1 << chp->cq.size_log2), + (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +} + +static int +iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ +#ifdef notyet + struct iwch_cq *chp = to_iwch_cq(cq); + struct t3_cq oldcq, newcq; + int ret; + + CTR3(KTR_IW_CXGB, "%s ib_cq %p cqe %d", __FUNCTION__, cq, cqe); + + /* We don't downsize... 
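+ * As written, a request for the same number of CQEs or fewer is
+ * treated as a no-op and reported as success.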
*/ + if (cqe <= cq->cqe) + return 0; + + /* create new t3_cq with new size */ + cqe = roundup_pow_of_two(cqe+1); + newcq.size_log2 = ilog2(cqe); + + /* Dont allow resize to less than the current wce count */ + if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { + return (-ENOMEM); + } + + /* Quiesce all QPs using this CQ */ + ret = iwch_quiesce_qps(chp); + if (ret) { + return (ret); + } + + ret = cxio_create_cq(&chp->rhp->rdev, &newcq); + if (ret) { + return (ret); + } + + /* copy CQEs */ + memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * + sizeof(struct t3_cqe)); + + /* old iwch_qp gets new t3_cq but keeps old cqid */ + oldcq = chp->cq; + chp->cq = newcq; + chp->cq.cqid = oldcq.cqid; + + /* resize new t3_cq to update the HW context */ + ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); + if (ret) { + chp->cq = oldcq; + return ret; + } + chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1; + + /* destroy old t3_cq */ + oldcq.cqid = newcq.cqid; + ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); + if (ret) { + log(LOG_ERR, "%s - cxio_destroy_cq failed %d\n", + __FUNCTION__, ret); + } + + /* add user hooks here */ + + /* resume qps */ + ret = iwch_resume_qps(chp); + return ret; +#else + return (-ENOSYS); +#endif +} + +static int +iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + enum t3_cq_opcode cq_op; + int err; + u32 rptr; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_op = CQ_ARM_SE; + else + cq_op = CQ_ARM_AN; + if (chp->user_rptr_addr) { + if (copyin(&rptr, chp->user_rptr_addr, 4)) + return (-EFAULT); + mtx_lock(&chp->lock); + chp->cq.rptr = rptr; + } else + mtx_lock(&chp->lock); + CTR2(KTR_IW_CXGB, "%s rptr 0x%x", __FUNCTION__, chp->cq.rptr); + err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0); + mtx_unlock(&chp->lock); + if (err < 0) + log(LOG_ERR, "Error %d rearming CQID 0x%x\n", err, + chp->cq.cqid); + if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) + err = 0; + return err; +} + +#ifdef notyet +static int +iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ +#ifdef notyet + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct cxio_rdev *rdev_p; + int ret = 0; + struct iwch_mm_entry *mm; + struct iwch_ucontext *ucontext; + u64 addr; + + CTR4(KTR_IW_CXGB, "%s pgoff 0x%lx key 0x%x len %d", __FUNCTION__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) { + return (-EINVAL); + } + + rdev_p = &(to_iwch_dev(context->device)->rdev); + ucontext = to_iwch_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return (-EINVAL); + addr = mm->addr; + cxfree(mm); + + if ((addr >= rdev_p->rnic_info.udbell_physbase) && + (addr < (rdev_p->rnic_info.udbell_physbase + + rdev_p->rnic_info.udbell_len))) { + + /* + * Map T3 DB register. + */ + if (vma->vm_flags & VM_READ) { + return (-EPERM); + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_flags &= ~VM_MAYREAD; + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... 
+ */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +#endif + return (0); +} +#endif + +static int iwch_deallocate_pd(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + + php = to_iwch_pd(pd); + rhp = php->rhp; + CTR3(KTR_IW_CXGB, "%s ibpd %p pdid 0x%x", __FUNCTION__, pd, php->pdid); + cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); + cxfree(php); + return 0; +} + +static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct iwch_pd *php; + u32 pdid; + struct iwch_dev *rhp; + + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + rhp = (struct iwch_dev *) ibdev; + pdid = cxio_hal_get_pdid(rhp->rdev.rscp); + if (!pdid) + return ERR_PTR(-EINVAL); + php = malloc(sizeof(*php), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!php) { + cxio_hal_put_pdid(rhp->rdev.rscp, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { + iwch_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + CTR3(KTR_IW_CXGB, "%s pdid 0x%0x ptr 0x%p", __FUNCTION__, pdid, php); + return &php->ibpd; +} + +static int iwch_dereg_mr(struct ib_mr *ib_mr) +{ + struct iwch_dev *rhp; + struct iwch_mr *mhp; + u32 mmid; + + CTR2(KTR_IW_CXGB, "%s ib_mr %p", __FUNCTION__, ib_mr); + /* There can be no memory windows */ + if (atomic_load_acq_int(&ib_mr->usecnt)) + return (-EINVAL); + + mhp = to_iwch_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->kva) + cxfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); + CTR3(KTR_IW_CXGB, "%s mmid 0x%x ptr %p", __FUNCTION__, mmid, mhp); + cxfree(mhp); + return 0; +} + +static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + int ret; + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + php = to_iwch_pd(pd); + rhp = php->rhp; + + mhp = malloc(sizeof(*mhp), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!mhp) + return ERR_PTR(-ENOMEM); + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, &page_list); + if (ret) + goto err; + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = iwch_register_mem(rhp, php, mhp, shift, page_list); + cxfree(page_list); + if (ret) { + goto err; + } + return &mhp->ibmr; +err: + cxfree(mhp); + return ERR_PTR(-ret); + +} + +static int iwch_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 * iova_start) +{ + + struct iwch_mr mh, *mhp; + struct iwch_pd *php; + struct iwch_dev *rhp; + __be64 
*page_list = NULL; + int shift = 0; + u64 total_size; + int npages; + int ret; + + CTR3(KTR_IW_CXGB, "%s ib_mr %p ib_pd %p", __FUNCTION__, mr, pd); + + /* There can be no memory windows */ + if (atomic_load_acq_int(&mr->usecnt)) + return (-EINVAL); + + mhp = to_iwch_mr(mr); + rhp = mhp->rhp; + php = to_iwch_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return (-EINVAL); + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_iwch_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mh.attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + if (ret) + return ret; + } + + ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + cxfree(page_list); + if (ret) { + return ret; + } + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + + +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, i, n; + int err = 0; + struct ib_umem_chunk *chunk; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + struct iwch_reg_user_mr_resp uresp; +#ifdef notyet + int j, k, len; +#endif + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = malloc(sizeof(*mhp), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + cxfree(mhp); + return ERR_PTR(-err); + } + + shift = ffs(mhp->umem->page_size) - 1; + + n = 0; + TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) + n += chunk->nents; + + pages = kmalloc(n * sizeof(u64), M_NOWAIT); + if (!pages) { + err = -ENOMEM; + goto err; + } + + i = n = 0; + +#if 0 + TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) + for (j = 0; j < chunk->nmap; ++j) { + len = sg_dma_len(&chunk->page_list[j]) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = htobe64(sg_dma_address( + &chunk->page_list[j]) + + mhp->umem->page_size * k); + } + } +#endif + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) length; + mhp->attr.pbl_size = i; + err = iwch_register_mem(rhp, php, mhp, shift, pages); + cxfree(pages); + if (err) + goto err; + + if (udata && !t3a_device(rhp)) { + uresp.pbl_addr = (mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3; + CTR2(KTR_IW_CXGB, "%s user resp pbl_addr 0x%x", __FUNCTION__, + uresp.pbl_addr); + + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + iwch_dereg_mr(&mhp->ibmr); + err = EFAULT; + goto err; + } + } + + return &mhp->ibmr; + +err: + ib_umem_release(mhp->umem); + cxfree(mhp); + return ERR_PTR(-err); +} + +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva; + struct ib_mr *ibmr; + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + + /* + * T3 only supports 32 bits of size. 
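+ * The DMA MR is therefore set up as a single physical registration
+ * covering addresses 0 through 0xffffffff, the largest region a TPT
+ * entry can describe.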
+ */ + bl.size = 0xffffffff; + bl.addr = 0; + kva = 0; + ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); + return ibmr; +} + +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = malloc(sizeof(*mhp), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + cxfree(mhp); + return ERR_PTR(-ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + CTR4(KTR_IW_CXGB, "%s mmid 0x%x mhp %p stag 0x%x", __FUNCTION__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +static int iwch_dealloc_mw(struct ib_mw *mw) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + u32 mmid; + + mhp = to_iwch_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + remove_handle(rhp, &rhp->mmidr, mmid); + cxfree(mhp); + CTR4(KTR_IW_CXGB, "%s ib_mw %p mmid 0x%x ptr %p", __FUNCTION__, mw, mmid, mhp); + return 0; +} + +static int iwch_destroy_qp(struct ib_qp *ib_qp) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_qp_attributes attrs; + struct iwch_ucontext *ucontext; + + qhp = to_iwch_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); + mtx_lock(&qhp->lock); + if (qhp->ep) + msleep(qhp, &qhp->lock, 0, "iwch_destroy_qp1", 0); + mtx_unlock(&qhp->lock); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); + + mtx_lock(&qhp->lock); + if (--qhp->refcnt) + msleep(qhp, &qhp->lock, 0, "iwch_destroy_qp2", 0); + mtx_unlock(&qhp->lock); + + ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context) + : NULL; + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + CTR4(KTR_IW_CXGB, "%s ib_qp %p qpid 0x%0x qhp %p", __FUNCTION__, + ib_qp, qhp->wq.qpid, qhp); + cxfree(qhp); + return 0; +} + +static struct ib_qp *iwch_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_pd *php; + struct iwch_cq *schp; + struct iwch_cq *rchp; + struct iwch_create_qp_resp uresp; + int wqsize, sqsize, rqsize; + struct iwch_ucontext *ucontext; + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + /* The RQT size must be # of entries + 1 rounded up to a power of two */ + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); + if (rqsize == attrs->cap.max_recv_wr) + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); + + /* T3 doesn't support RQT depth < 16 */ + if (rqsize < 16) + rqsize = 16; + + if (rqsize > T3_MAX_RQ_SIZE) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T3_MAX_INLINE) + return ERR_PTR(-EINVAL); + + /* + * NOTE: The SQ and total WQ sizes don't need to be + * a power of two. However, all the code assumes + * they are. EG: Q_FREECNT() and friends. 
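+ * Worked example (caller-supplied values, shown only for
+ * illustration): max_send_wr = 100 and max_recv_wr = 100 give
+ * sqsize = 128, rqsize = 128 and wqsize = 256, and the capabilities
+ * returned to the caller become max_send_wr = 128, max_recv_wr = 127.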
+ */ + sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); + wqsize = roundup_pow_of_two(rqsize + sqsize); + CTR4(KTR_IW_CXGB, "%s wqsize %d sqsize %d rqsize %d", __FUNCTION__, + wqsize, sqsize, rqsize); + qhp = malloc(sizeof(*qhp), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.size_log2 = ilog2(wqsize); + qhp->wq.rq_size_log2 = ilog2(rqsize); + qhp->wq.sq_size_log2 = ilog2(sqsize); + ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; + if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { + cxfree(qhp); + return ERR_PTR(-ENOMEM); + } + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize; + attrs->cap.max_inline_data = T3_MAX_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.next_state = IWCH_QP_STATE_IDLE; + + /* + * XXX - These don't get passed in from the openib user + * at create time. The CM sets them via a QP modify. + * Need to fix... I think the CM should + */ + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 1; + qhp->attr.max_ird = 1; + + mtx_init(&qhp->lock, "cxgb qp", NULL, MTX_DEF|MTX_DUPOK); + qhp->refcnt = 1; + insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid); + + if (udata) { + + struct iwch_mm_entry *mm1, *mm2; + + mm1 = kmalloc(sizeof *mm1, M_NOWAIT); + if (!mm1) { + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + mm2 = kmalloc(sizeof *mm2, M_NOWAIT); + if (!mm2) { + cxfree(mm1); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + uresp.qpid = qhp->wq.qpid; + uresp.size_log2 = qhp->wq.size_log2; + uresp.sq_size_log2 = qhp->wq.sq_size_log2; + uresp.rq_size_log2 = qhp->wq.rq_size_log2; + mtx_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.db_key = ucontext->key; + ucontext->key += PAGE_SIZE; + mtx_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + cxfree(mm1); + cxfree(mm2); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-EFAULT); + } + mm1->key = uresp.key; + mm1->addr = vtophys(qhp->wq.queue); + mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + insert_mmap(ucontext, mm1); + mm2->key = uresp.db_key; + mm2->addr = qhp->wq.udb & PAGE_MASK; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + qhp->ibqp.qp_num = qhp->wq.qpid; + callout_init(&(qhp->timer), TRUE); + CTR6(KTR_IW_CXGB, "sq_num_entries %d, rq_num_entries %d " + "qpid 0x%0x qhp %p dma_addr 0x%llx size %d", + qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr, + 1 << qhp->wq.size_log2); + return &qhp->ibqp; +} + +static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + enum iwch_qp_attr_mask mask = 0; + struct iwch_qp_attributes attrs; + + CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state 
== IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_iwch_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = iwch_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? + (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +void iwch_qp_add_ref(struct ib_qp *qp) +{ + CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, qp); + mtx_lock(&to_iwch_qp(qp)->lock); + to_iwch_qp(qp)->refcnt++; + mtx_unlock(&to_iwch_qp(qp)->lock); +} + +void iwch_qp_rem_ref(struct ib_qp *qp) +{ + CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, qp); + mtx_lock(&to_iwch_qp(qp)->lock); + if (--to_iwch_qp(qp)->refcnt == 0) + wakeup(to_iwch_qp(qp)); + mtx_unlock(&to_iwch_qp(qp)->lock); +} + +static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) +{ + CTR3(KTR_IW_CXGB, "%s ib_dev %p qpn 0x%x", __FUNCTION__, dev, qpn); + return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); +} + + +static int iwch_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + *pkey = 0; + return 0; +} + +static int iwch_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct iwch_dev *dev; + struct port_info *pi; + + CTR5(KTR_IW_CXGB, "%s ibdev %p, port %d, index %d, gid %p", + __FUNCTION__, ibdev, port, index, gid); + dev = to_iwch_dev(ibdev); + PANIC_IF(port == 0 || port > 2); + pi = ((struct port_info *)dev->rdev.port_info.lldevs[port-1]->if_softc); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), pi->hw_addr, 6); + return 0; +} + +static int iwch_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct iwch_dev *dev; + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + + dev = to_iwch_dev(ibdev); + memset(props, 0, sizeof *props); +#ifdef notyet + memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->if_addr.ifa_addr, 6); +#endif + props->device_cap_flags = dev->device_cap_flags; +#ifdef notyet + props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; +#endif + props->max_mr_size = ~0ull; + props->max_qp = dev->attr.max_qps; + props->max_qp_wr = dev->attr.max_wrs; + props->max_sge = dev->attr.max_sge_per_wr; + props->max_sge_rd = 1; + props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_cq = dev->attr.max_cqs; + props->max_cqe = dev->attr.max_cqes_per_cq; + props->max_mr = dev->attr.max_mem_regs; + props->max_pd = dev->attr.max_pds; + props->local_ca_ack_delay = 0; + + return 0; +} + +static int iwch_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + props->max_mtu = IB_MTU_4096; + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = + IB_PORT_CM_SUP | + 
IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = 2; + props->active_speed = 2; + props->max_msg_sz = -1; + + return 0; +} + +#ifdef notyet +static ssize_t show_rev(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); + return sprintf(buf, "%d\n", dev->rdev.t3cdev_p->type); +} + +static ssize_t show_fw_ver(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.fw_version); +} + +static ssize_t show_hca(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, dev); +#ifdef notyet + return sprintf(buf, "%x.%x\n", dev->rdev.rnic_info.pdev->vendor, + dev->rdev.rnic_info.pdev->device); +#else + return sprintf(buf, "%x.%x\n", 0xdead, 0xbeef); /* XXX */ +#endif +} + +static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct class_device_attribute *iwch_class_attributes[] = { + &class_device_attr_hw_rev, + &class_device_attr_fw_ver, + &class_device_attr_hca_type, + &class_device_attr_board_id +}; +#endif + +int iwch_register_device(struct iwch_dev *dev) +{ + int ret; +#ifdef notyet + int i; +#endif + CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev); + strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); +#ifdef notyet + memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); +#endif + dev->device_cap_flags = + (IB_DEVICE_ZERO_STAG | + IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW); + + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + 
memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dma_device = dev->rdev.rnic_info.pdev; + dev->ibdev.query_device = iwch_query_device; + dev->ibdev.query_port = iwch_query_port; + dev->ibdev.modify_port = iwch_modify_port; + dev->ibdev.query_pkey = iwch_query_pkey; + dev->ibdev.query_gid = iwch_query_gid; + dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; + dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; +#ifdef notyet + dev->ibdev.mmap = iwch_mmap; +#endif + dev->ibdev.alloc_pd = iwch_allocate_pd; + dev->ibdev.dealloc_pd = iwch_deallocate_pd; + dev->ibdev.create_ah = iwch_ah_create; + dev->ibdev.destroy_ah = iwch_ah_destroy; + dev->ibdev.create_qp = iwch_create_qp; + dev->ibdev.modify_qp = iwch_ib_modify_qp; + dev->ibdev.destroy_qp = iwch_destroy_qp; + dev->ibdev.create_cq = iwch_create_cq; + dev->ibdev.destroy_cq = iwch_destroy_cq; + dev->ibdev.resize_cq = iwch_resize_cq; + dev->ibdev.poll_cq = iwch_poll_cq; + dev->ibdev.get_dma_mr = iwch_get_dma_mr; + dev->ibdev.reg_phys_mr = iwch_register_phys_mem; + dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; + dev->ibdev.reg_user_mr = iwch_reg_user_mr; + dev->ibdev.dereg_mr = iwch_dereg_mr; + dev->ibdev.alloc_mw = iwch_alloc_mw; + dev->ibdev.bind_mw = iwch_bind_mw; + dev->ibdev.dealloc_mw = iwch_dealloc_mw; + + dev->ibdev.attach_mcast = iwch_multicast_attach; + dev->ibdev.detach_mcast = iwch_multicast_detach; + dev->ibdev.process_mad = iwch_process_mad; + + dev->ibdev.req_notify_cq = iwch_arm_cq; + dev->ibdev.post_send = iwch_post_send; + dev->ibdev.post_recv = iwch_post_receive; + + + dev->ibdev.iwcm = + (struct iw_cm_verbs *) kmalloc(sizeof(struct iw_cm_verbs), + M_NOWAIT); + dev->ibdev.iwcm->connect = iwch_connect; + dev->ibdev.iwcm->accept = iwch_accept_cr; + dev->ibdev.iwcm->reject = iwch_reject_cr; + dev->ibdev.iwcm->create_listen = iwch_create_listen; + dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; + dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; + dev->ibdev.iwcm->get_qp = iwch_get_qp; + + ret = ib_register_device(&dev->ibdev); + if (ret) + goto bail1; +#ifdef notyet + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { + ret = class_device_create_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); + if (ret) { + goto bail2; + } + } +#endif + return 0; +#ifdef notyet +bail2: +#endif + ib_unregister_device(&dev->ibdev); +bail1: + return ret; +} + +void iwch_unregister_device(struct iwch_dev *dev) +{ +#ifdef notyet + int i; + + CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev); + + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) + class_device_remove_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); +#endif + ib_unregister_device(&dev->ibdev); + return; +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h new file mode 100644 index 0000000000000..c857ce8e5b8f0 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h @@ -0,0 +1,362 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef __IWCH_PROVIDER_H__ +#define __IWCH_PROVIDER_H__ + +#include <contrib/rdma/ib_verbs.h> + +struct iwch_pd { + struct ib_pd ibpd; + u32 pdid; + struct iwch_dev *rhp; +}; + +#ifndef container_of +#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) +#endif +static __inline struct iwch_pd * +to_iwch_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct iwch_pd, ibpd); +} + +struct tpt_attributes { + u32 stag; + u32 state:1; + u32 type:2; + u32 rsvd:1; + enum tpt_mem_perm perms; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; + + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 len; + u64 va_fbo; + u32 pbl_size; +}; + +struct iwch_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +typedef struct iwch_mw iwch_mw_handle; + +static __inline struct iwch_mr * +to_iwch_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct iwch_mr, ibmr); +} + +struct iwch_mw { + struct ib_mw ibmw; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static __inline struct iwch_mw * +to_iwch_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct iwch_mw, ibmw); +} + +struct iwch_cq { + struct ib_cq ibcq; + struct iwch_dev *rhp; + struct t3_cq cq; + struct mtx lock; + int refcnt; + u32 /* __user */ *user_rptr_addr; +}; + +static __inline struct iwch_cq * +to_iwch_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct iwch_cq, ibcq); +} + +enum IWCH_QP_FLAGS { + QP_QUIESCED = 0x01 +}; + +struct iwch_mpa_attributes { + u8 recv_marker_enabled; + u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ + u8 crc_enabled; + u8 version; /* 0 or 1 */ +}; + +struct iwch_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; /* enable inbound Read Resp. */ + u8 enable_bind; + u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */ + /* + * Next QP state. If specify the current state, only the + * QP attributes will be modified. + */ + u32 max_ord; + u32 max_ird; + u32 pd; /* IN */ + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct iwch_mpa_attributes mpa_attr; /* IN-OUT */ + struct iwch_ep *llp_stream_handle; + char *stream_msg_buf; /* Last stream msg. 
before Idle -> RTS */ + u32 stream_msg_buf_len; /* Only on Idle -> RTS */ +}; + +struct iwch_qp { + struct ib_qp ibqp; + struct iwch_dev *rhp; + struct iwch_ep *ep; + struct iwch_qp_attributes attr; + struct t3_wq wq; + struct mtx lock; + int refcnt; + enum IWCH_QP_FLAGS flags; + struct callout timer; +}; + +static __inline int +qp_quiesced(struct iwch_qp *qhp) +{ + return qhp->flags & QP_QUIESCED; +} + +static __inline struct iwch_qp * +to_iwch_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct iwch_qp, ibqp); +} + +void iwch_qp_add_ref(struct ib_qp *qp); +void iwch_qp_rem_ref(struct ib_qp *qp); + +struct iwch_ucontext { + struct ib_ucontext ibucontext; + struct cxio_ucontext uctx; + u32 key; + struct mtx mmap_lock; + TAILQ_HEAD( ,iwch_mm_entry) mmaps; +}; + +static __inline struct iwch_ucontext * +to_iwch_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct iwch_ucontext, ibucontext); +} + +struct iwch_mm_entry { + TAILQ_ENTRY(iwch_mm_entry) entry; + u64 addr; + u32 key; + unsigned len; +}; + +static __inline struct iwch_mm_entry * +remove_mmap(struct iwch_ucontext *ucontext, + u32 key, unsigned len) +{ + struct iwch_mm_entry *tmp, *mm; + + mtx_lock(&ucontext->mmap_lock); + TAILQ_FOREACH_SAFE(mm, &ucontext->mmaps, entry, tmp) { + if (mm->key == key && mm->len == len) { + TAILQ_REMOVE(&ucontext->mmaps, mm, entry); + mtx_unlock(&ucontext->mmap_lock); + CTR4(KTR_IW_CXGB, "%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + mtx_unlock(&ucontext->mmap_lock); + + return NULL; +} + +static __inline void +insert_mmap(struct iwch_ucontext *ucontext, + struct iwch_mm_entry *mm) +{ + mtx_lock(&ucontext->mmap_lock); + CTR4(KTR_IW_CXGB, "%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + mm->key, (unsigned long long) mm->addr, mm->len); + TAILQ_INSERT_TAIL(&ucontext->mmaps, mm, entry); + mtx_unlock(&ucontext->mmap_lock); +} + +enum iwch_qp_attr_mask { + IWCH_QP_ATTR_NEXT_STATE = 1 << 0, + IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + IWCH_QP_ATTR_MAX_ORD = 1 << 11, + IWCH_QP_ATTR_MAX_IRD = 1 << 12, + IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + IWCH_QP_ATTR_MPA_ATTR = 1 << 24, + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_MAX_ORD | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_STREAM_MSG_BUFFER | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int iwch_modify_qp(struct iwch_dev *rhp, + struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal); + +enum iwch_qp_state { + IWCH_QP_STATE_IDLE, + IWCH_QP_STATE_RTS, + IWCH_QP_STATE_ERROR, + IWCH_QP_STATE_TERMINATE, + IWCH_QP_STATE_CLOSING, + IWCH_QP_STATE_TOT +}; + +static __inline int +iwch_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return IWCH_QP_STATE_IDLE; + case IB_QPS_RTS: + return IWCH_QP_STATE_RTS; + case IB_QPS_SQD: + return IWCH_QP_STATE_CLOSING; + case IB_QPS_SQE: + return IWCH_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return IWCH_QP_STATE_ERROR; + default: + return -1; + } +} + +static __inline u32 +iwch_ib_to_tpt_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? 
TPT_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) | + TPT_LOCAL_READ; +} + +static __inline u32 +iwch_ib_to_mwbind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? T3_MEM_ACCESS_REM_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? T3_MEM_ACCESS_REM_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? T3_MEM_ACCESS_LOCAL_WRITE : 0) | + T3_MEM_ACCESS_LOCAL_READ; +} + +enum iwch_mmid_state { + IWCH_STAG_STATE_VALID, + IWCH_STAG_STATE_INVALID +}; + +enum iwch_qp_query_flags { + IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */ + IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */ + IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */ + + /* + * Quiesce QP context; Consumer + * will NOT replay outstanding WR + */ + IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4, + IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8, + IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ +}; + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_register_device(struct iwch_dev *dev); +void iwch_unregister_device(struct iwch_dev *dev); +int iwch_quiesce_qps(struct iwch_cq *chp); +int iwch_resume_qps(struct iwch_cq *chp); +void stop_read_rep_timer(struct iwch_qp *qhp); +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list); +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages); +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list); + + +#define IWCH_NODE_DESC "cxgb3 Chelsio Communications" + +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c new file mode 100644 index 0000000000000..3c203f1b43375 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c @@ -0,0 +1,1052 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + + +#define NO_SUPPORT -1 + +static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, + u8 * flit_cnt) +{ + int i; + u32 plen; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE; + else + wqe->send.rdmaop = T3_SEND; + wqe->send.rem_stag = 0; + break; +#if 0 /* Not currently supported */ + case TYPE_SEND_INVALIDATE: + case TYPE_SEND_INVALIDATE_IMMEDIATE: + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey); + break; + case TYPE_SEND_SE_INVALIDATE: + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey); + break; +#endif + default: + break; + } + if (wr->num_sge > T3_MAX_SGE) + return (-EINVAL); + wqe->send.reserved[0] = 0; + wqe->send.reserved[1] = 0; + wqe->send.reserved[2] = 0; + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + plen = 4; + wqe->send.sgl[0].stag = wr->imm_data; + wqe->send.sgl[0].len = 0; + wqe->send.num_sgle = 0; + *flit_cnt = 5; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return (-EMSGSIZE); + } + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = + htobe32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = + htobe32(wr->sg_list[i].length); + wqe->send.sgl[i].to = htobe64(wr->sg_list[i].addr); + } + wqe->send.num_sgle = 
htobe32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); + } + wqe->send.plen = htobe32(plen); + return 0; +} + +static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + int i; + u32 plen; + + if (wr->num_sge > T3_MAX_SGE) + return (-EINVAL); + wqe->write.rdmaop = T3_RDMA_WRITE; + wqe->write.reserved[0] = 0; + wqe->write.reserved[1] = 0; + wqe->write.reserved[2] = 0; + wqe->write.stag_sink = htobe32(wr->wr.rdma.rkey); + wqe->write.to_sink = htobe64(wr->wr.rdma.remote_addr); + + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + plen = 4; + wqe->write.sgl[0].stag = wr->imm_data; + wqe->write.sgl[0].len = 0; + wqe->write.num_sgle = 0; + *flit_cnt = 6; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return (-EMSGSIZE); + } + plen += wr->sg_list[i].length; + wqe->write.sgl[i].stag = + htobe32(wr->sg_list[i].lkey); + wqe->write.sgl[i].len = + htobe32(wr->sg_list[i].length); + wqe->write.sgl[i].to = + htobe64(wr->sg_list[i].addr); + } + wqe->write.num_sgle = htobe32(wr->num_sge); + *flit_cnt = 5 + ((wr->num_sge) << 1); + } + wqe->write.plen = htobe32(plen); + return 0; +} + +static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + if (wr->num_sge > 1) + return (-EINVAL); + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.reserved[2] = 0; + wqe->read.rem_stag = htobe32(wr->wr.rdma.rkey); + wqe->read.rem_to = htobe64(wr->wr.rdma.remote_addr); + wqe->read.local_stag = htobe32(wr->sg_list[0].lkey); + wqe->read.local_len = htobe32(wr->sg_list[0].length); + wqe->read.local_to = htobe64(wr->sg_list[0].addr); + *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + return 0; +} + +/* + * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now. 
+ */ +static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, + u32 num_sgle, u32 * pbl_addr, u8 * page_size) +{ + int i; + struct iwch_mr *mhp; + u32 offset; + for (i = 0; i < num_sgle; i++) { + + mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); + if (!mhp) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EIO); + } + if (!mhp->attr.state) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EIO); + } + if (mhp->attr.zbva) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EIO); + } + + if (sg_list[i].addr < mhp->attr.va_fbo) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EINVAL); + } + if (sg_list[i].addr + ((u64) sg_list[i].length) < + sg_list[i].addr) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EINVAL); + } + if (sg_list[i].addr + ((u64) sg_list[i].length) > + mhp->attr.va_fbo + ((u64) mhp->attr.len)) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EINVAL); + } + offset = sg_list[i].addr - mhp->attr.va_fbo; + offset += ((u32) mhp->attr.va_fbo) % + (1UL << (12 + mhp->attr.page_size)); + pbl_addr[i] = ((mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3) + + (offset >> (12 + mhp->attr.page_size)); + page_size[i] = mhp->attr.page_size; + } + return 0; +} + +static int iwch_build_rdma_recv(struct iwch_dev *rhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i; + if (wr->num_sge > T3_MAX_SGE) + return (-EINVAL); + wqe->recv.num_sgle = htobe32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { + wqe->recv.sgl[i].stag = htobe32(wr->sg_list[i].lkey); + wqe->recv.sgl[i].len = htobe32(wr->sg_list[i].length); + wqe->recv.sgl[i].to = htobe64(wr->sg_list[i].addr); + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + } + return 0; +} + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 t3_wr_flit_cnt = 0; + enum t3_wr_opcode t3_wr_opcode = 0; + enum t3_wr_flags t3_wr_flags; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(ibqp); + mtx_lock(&qhp->lock); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs <= 0) { + mtx_unlock(&qhp->lock); + return (-ENOMEM); + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + t3_wr_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + t3_wr_flags |= T3_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_READ_FENCE_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED) + t3_wr_flags |= T3_COMPLETION_FLAG; + sqp = qhp->wq.sq + + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + t3_wr_opcode = T3_WR_SEND; + err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + t3_wr_opcode = T3_WR_WRITE; + err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_READ: + t3_wr_opcode = T3_WR_READ; + t3_wr_flags = 0; /* T3 reads are always signaled */ + err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt); + if (err) + break; + sqp->read_len = wqe->read.local_len; + if (!qhp->wq.oldest_read) + 
qhp->wq.oldest_read = sqp; + break; + default: + CTR2(KTR_IW_CXGB, "%s post of type=%d TBD!", __FUNCTION__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp->wr_id = wr->wr_id; + sqp->opcode = wr2opcode(t3_wr_opcode); + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED); + + build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, t3_wr_flit_cnt); + CTR5(KTR_IW_CXGB, "%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d", + __FUNCTION__, (unsigned long long) wr->wr_id, idx, + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2), + sqp->opcode); + wr = wr->next; + num_wrs--; + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + } + mtx_unlock(&qhp->lock); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + + qhp = to_iwch_qp(ibqp); + mtx_lock(&qhp->lock); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, + qhp->wq.rq_size_log2) - 1; + if (!wr) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + while (wr) { + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + if (num_wrs) + err = iwch_build_rdma_recv(qhp->rhp, wqe, wr); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] = + wr->wr_id; + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, sizeof(struct t3_receive_wr) >> 3); + CTR6(KTR_IW_CXGB, "%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x " + "wqe %p ", __FUNCTION__, (unsigned long long) wr->wr_id, + idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe); + ++(qhp->wq.rq_wptr); + ++(qhp->wq.wptr); + wr = wr->next; + num_wrs--; + } + mtx_unlock(&qhp->lock); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + struct iwch_qp *qhp; + union t3_wr *wqe; + u32 pbl_addr; + u8 page_size; + u32 num_wrs; + struct ib_sge sgl; + int err=0; + enum t3_wr_flags t3_wr_flags; + u32 idx; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(qp); + mhp = to_iwch_mw(mw); + rhp = qhp->rhp; + + mtx_lock(&qhp->lock); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if ((num_wrs) <= 0) { + mtx_unlock(&qhp->lock); + return (-ENOMEM); + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + CTR4(KTR_IW_CXGB, "%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p", __FUNCTION__, idx, + mw, mw_bind); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + + t3_wr_flags = 0; + if (mw_bind->send_flags & IB_SEND_SIGNALED) + t3_wr_flags = T3_COMPLETION_FLAG; + + sgl.addr = mw_bind->addr; + sgl.lkey = mw_bind->mr->lkey; + sgl.length = mw_bind->length; + wqe->bind.reserved = 0; + wqe->bind.type = T3_VA_BASED_TO; + + /* TBD: check perms */ + wqe->bind.perms = iwch_ib_to_mwbind_access(mw_bind->mw_access_flags); + wqe->bind.mr_stag = htobe32(mw_bind->mr->lkey); + wqe->bind.mw_stag = htobe32(mw->rkey); + wqe->bind.mw_len = 
htobe32(mw_bind->length); + wqe->bind.mw_va = htobe64(mw_bind->addr); + err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); + if (err) { + mtx_unlock(&qhp->lock); + return (err); + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + sqp->wr_id = mw_bind->wr_id; + sqp->opcode = T3_BIND_MW; + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); + wqe->bind.mr_pbl_addr = htobe32(pbl_addr); + wqe->bind.mr_pagesz = page_size; + wqe->flit[T3_SQ_COOKIE_FLIT] = mw_bind->wr_id; + build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, + sizeof(struct t3_bind_mw_wr) >> 3); + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + mtx_unlock(&qhp->lock); + + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + + return err; +} + +static inline void build_term_codes(struct respQ_msg_t *rsp_msg, + u8 *layer_type, u8 *ecode) +{ + int status = TPT_ERR_INTERNAL_ERR; + int tagged = 0; + int opcode = -1; + int rqtype = 0; + int send_inv = 0; + + if (rsp_msg) { + status = CQE_STATUS(rsp_msg->cqe); + opcode = CQE_OPCODE(rsp_msg->cqe); + rqtype = RQ_TYPE(rsp_msg->cqe); + send_inv = (opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV); + tagged = (opcode == T3_RDMA_WRITE) || + (rqtype && (opcode == T3_READ_RESP)); + } + + switch (status) { + case TPT_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case TPT_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case TPT_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case TPT_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case TPT_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case TPT_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case TPT_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case TPT_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case TPT_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case TPT_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case TPT_ERR_OPCODE: + 
*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case TPT_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case TPT_ERR_MSN: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case TPT_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +/* + * This posts a TERMINATE with layer=RDMA, type=catastrophic. + */ +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) +{ + union t3_wr *wqe; + struct terminate_message *term; + struct mbuf *m; + + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + m = m_gethdr(MT_DATA, M_NOWAIT); + if (!m) { + log(LOG_ERR, "%s cannot send TERMINATE!\n", __FUNCTION__); + return (-ENOMEM); + } + wqe = mtod(m, union t3_wr *); + m->m_len = m->m_pkthdr.len = 40; + memset(wqe, 0, 40); + wqe->send.rdmaop = T3_TERMINATE; + + /* immediate data length */ + wqe->send.plen = htonl(4); + + /* immediate data starts here. */ + term = (struct terminate_message *)wqe->send.sgl; + build_term_codes(rsp_msg, &term->layer_etype, &term->ecode); + wqe->send.wrh.op_seop_flags = htobe32(V_FW_RIWR_OP(T3_WR_SEND) | + V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG)); + wqe->send.wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(qhp->ep->hwtid)); + + m_set_priority(m, CPL_PRIORITY_DATA); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return cxgb_ofld_send(qhp->rhp->rdev.t3cdev_p, m); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct iwch_qp *qhp) +{ + struct iwch_cq *rchp, *schp; + int count; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + CTR4(KTR_IW_CXGB, "%s qhp %p rchp %p schp %p", __FUNCTION__, qhp, rchp, schp); + /* take a ref on the qhp since we must release the lock */ + qhp->refcnt++; + mtx_unlock(&qhp->lock); + + /* locking heirarchy: cq lock first, then qp lock. */ + mtx_lock(&rchp->lock); + mtx_lock(&qhp->lock); + cxio_flush_hw_cq(&rchp->cq); + cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); + cxio_flush_rq(&qhp->wq, &rchp->cq, count); + mtx_unlock(&qhp->lock); + mtx_unlock(&rchp->lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + + /* locking heirarchy: cq lock first, then qp lock. */ + mtx_lock(&schp->lock); + mtx_lock(&qhp->lock); + cxio_flush_hw_cq(&schp->cq); + cxio_count_scqes(&schp->cq, &qhp->wq, &count); + cxio_flush_sq(&qhp->wq, &schp->cq, count); + mtx_unlock(&qhp->lock); + mtx_unlock(&schp->lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + + /* deref */ + mtx_lock(&qhp->lock); + if (--qhp->refcnt == 0) + wakeup(qhp); +} + +static void flush_qp(struct iwch_qp *qhp) +{ + if (qhp->ibqp.uobject) + cxio_set_wq_in_error(&qhp->wq); + else + __flush_qp(qhp); +} + + +/* + * Return non zero if at least one RECV was pre-posted. 
+ */ +static int rqes_posted(struct iwch_qp *qhp) +{ + return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV; +} + +static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs) +{ + struct t3_rdma_init_attr init_attr; + int ret; + + init_attr.tid = qhp->ep->hwtid; + init_attr.qpid = qhp->wq.qpid; + init_attr.pdid = qhp->attr.pd; + init_attr.scqid = qhp->attr.scq; + init_attr.rcqid = qhp->attr.rcq; + init_attr.rq_addr = qhp->wq.rq_addr; + init_attr.rq_size = 1 << qhp->wq.rq_size_log2; + init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE | + qhp->attr.mpa_attr.recv_marker_enabled | + (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | + (qhp->attr.mpa_attr.crc_enabled << 2); + + /* + * XXX - The IWCM doesn't quite handle getting these + * attrs set before going into RTS. For now, just turn + * them on always... + */ +#if 0 + init_attr.qpcaps = qhp->attr.enableRdmaRead | + (qhp->attr.enableRdmaWrite << 1) | + (qhp->attr.enableBind << 2) | + (qhp->attr.enable_stag0_fastreg << 3) | + (qhp->attr.enable_stag0_fastreg << 4); +#else + init_attr.qpcaps = 0x1f; +#endif + init_attr.tcp_emss = qhp->ep->emss; + init_attr.ord = qhp->attr.max_ord; + init_attr.ird = qhp->attr.max_ird; + init_attr.qp_dma_addr = qhp->wq.dma_addr; + init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); + init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0; + init_attr.irs = qhp->ep->rcv_seq; + CTR5(KTR_IW_CXGB, "%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " + "flags 0x%x qpcaps 0x%x", __FUNCTION__, + init_attr.rq_addr, init_attr.rq_size, + init_attr.flags, init_attr.qpcaps); + ret = cxio_rdma_init(&rhp->rdev, &init_attr); + CTR2(KTR_IW_CXGB, "%s ret %d", __FUNCTION__, ret); + return ret; +} + +int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct iwch_qp_attributes newattr = qhp->attr; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct iwch_ep *ep = NULL; + + CTR6(KTR_IW_CXGB, "%s qhp %p qpid 0x%x ep %p state %d -> %d", __FUNCTION__, + qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state, + (mask & IWCH_QP_ATTR_NEXT_STATE) ? 
attrs->next_state : -1); + + mtx_lock(&qhp->lock); + + /* Process attr changes if in IDLE */ + if (mask & IWCH_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != IWCH_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & IWCH_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > + rhp->attr.max_rdma_read_qp_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & IWCH_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > + rhp->attr.max_rdma_reads_per_qp) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (!(mask & IWCH_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case IWCH_QP_STATE_IDLE: + switch (attrs->next_state) { + case IWCH_QP_STATE_RTS: + if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + qhp->attr.state = IWCH_QP_STATE_RTS; + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. + */ + get_ep(&qhp->ep->com); + mtx_unlock(&qhp->lock); + ret = rdma_init(rhp, qhp, mask, attrs); + mtx_lock(&qhp->lock); + if (ret) + goto err; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_RTS: + switch (attrs->next_state) { + case IWCH_QP_STATE_CLOSING: + PANIC_IF(atomic_load_acq_int(&qhp->ep->com.refcount) < 2); + qhp->attr.state = IWCH_QP_STATE_CLOSING; + if (!internal) { + abort=0; + disconnect = 1; + ep = qhp->ep; + } + flush_qp(qhp); + break; + case IWCH_QP_STATE_TERMINATE: + qhp->attr.state = IWCH_QP_STATE_TERMINATE; + if (qhp->ibqp.uobject) + cxio_set_wq_in_error(&qhp->wq); + if (!internal) + terminate = 1; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + if (!internal) { + abort=1; + disconnect = 1; + ep = qhp->ep; + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case IWCH_QP_STATE_IDLE: + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.llp_stream_handle = NULL; + put_ep(&qhp->ep->com); + qhp->ep = NULL; + wakeup(qhp); + break; + case IWCH_QP_STATE_ERROR: + disconnect=1; + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case IWCH_QP_STATE_ERROR: + if (attrs->next_state != IWCH_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + + if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) || + !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) { + ret = -EINVAL; + goto out; + } + qhp->attr.state = IWCH_QP_STATE_IDLE; + memset(&qhp->attr, 0, sizeof(qhp->attr)); + break; + case IWCH_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + log(LOG_ERR, "%s in a bad state %d\n", + __FUNCTION__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: 
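/*
 * Common error exit: detach the endpoint from the QP, force the QP into the
 * ERROR state, wake any threads sleeping on it, and flush outstanding work
 * requests; the endpoint reference itself is dropped once the lock is released.
 */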
+ CTR3(KTR_IW_CXGB, "%s disassociating ep %p qpid 0x%x", __FUNCTION__, qhp->ep, + qhp->wq.qpid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + ep = qhp->ep; + qhp->ep = NULL; + qhp->attr.state = IWCH_QP_STATE_ERROR; + free=1; + wakeup(qhp); + PANIC_IF(!ep); + flush_qp(qhp); +out: + mtx_unlock(&qhp->lock); + + if (terminate) + iwch_post_terminate(qhp, NULL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) + iwch_ep_disconnect(ep, abort, M_NOWAIT); + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + put_ep(&ep->com); + + CTR2(KTR_IW_CXGB, "%s exit state %d", __FUNCTION__, qhp->attr.state); + return ret; +} + +static int quiesce_qp(struct iwch_qp *qhp) +{ + mtx_lock(&qhp->lock); + iwch_quiesce_tid(qhp->ep); + qhp->flags |= QP_QUIESCED; + mtx_unlock(&qhp->lock); + return 0; +} + +static int resume_qp(struct iwch_qp *qhp) +{ + mtx_lock(&qhp->lock); + iwch_resume_tid(qhp->ep); + qhp->flags &= ~QP_QUIESCED; + mtx_lock(&qhp->lock); + return 0; +} + +int iwch_quiesce_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) { + quiesce_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp)) + quiesce_qp(qhp); + } + return 0; +} + +int iwch_resume_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) { + resume_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp)) + resume_qp(qhp); + } + return 0; +} + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c new file mode 100644 index 0000000000000..9d3618fbd57f0 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c @@ -0,0 +1,382 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +#ifdef needed +static struct buf_ring *rhdl_fifo; +static struct mtx rhdl_fifo_lock; +#endif + +#define RANDOM_SIZE 16 + +static int __cxio_init_resource_fifo(struct buf_ring **fifo, + struct mtx *fifo_lock, + u32 nr, u32 skip_low, + u32 skip_high, + int randomize) +{ + u32 i, j, idx; + u32 random_bytes; + u32 rarray[16]; + mtx_init(fifo_lock, "cxio fifo", NULL, MTX_DEF|MTX_DUPOK); + + *fifo = buf_ring_alloc(nr, M_NOWAIT); + if (*fifo == NULL) + return (-ENOMEM); +#if 0 + for (i = 0; i < skip_low + skip_high; i++) { + u32 entry = 0; + + buf_ring_enqueue(*fifo, (uintptr_t) entry); + } +#endif + if (randomize) { + j = 0; + random_bytes = random(); + for (i = 0; i < RANDOM_SIZE; i++) + rarray[i] = i + skip_low; + for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { + if (j >= RANDOM_SIZE) { + j = 0; + random_bytes = random(); + } + idx = (random_bytes >> (j * 2)) & 0xF; + buf_ring_enqueue(*fifo, (void *)(uintptr_t)rarray[idx]); + rarray[idx] = i; + j++; + } + for (i = 0; i < RANDOM_SIZE; i++) + buf_ring_enqueue(*fifo, (void *) (uintptr_t)rarray[i]); + } else + for (i = skip_low; i < nr - skip_high; i++) + buf_ring_enqueue(*fifo, (void *) (uintptr_t)i); +#if 0 + for (i = 0; i < skip_low + skip_high; i++) + buf_ring_dequeue(*fifo); +#endif + return 0; +} + +static int cxio_init_resource_fifo(struct buf_ring **fifo, struct mtx * fifo_lock, + u32 nr, u32 
skip_low, u32 skip_high) +{ + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 0)); +} + +static int cxio_init_resource_fifo_random(struct buf_ring **fifo, + struct mtx * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 1)); +} + +static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) +{ + u32 i; + + mtx_init(&rdev_p->rscp->qpid_fifo_lock, "qpid fifo", NULL, MTX_DEF); + + rdev_p->rscp->qpid_fifo = buf_ring_alloc(T3_MAX_NUM_QP, M_NOWAIT); + if (rdev_p->rscp->qpid_fifo == NULL) + return (-ENOMEM); + + for (i = 16; i < T3_MAX_NUM_QP; i++) + if (!(i & rdev_p->qpmask)) + buf_ring_enqueue(rdev_p->rscp->qpid_fifo, (void *) (uintptr_t)i); + return 0; +} + +#ifdef needed +int cxio_hal_init_rhdl_resource(u32 nr_rhdl) +{ + return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1, + 0); +} + +void cxio_hal_destroy_rhdl_resource(void) +{ + buf_ring_free(rhdl_fifo); +} +#endif + +/* nr_* must be power of 2 */ +int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid) +{ + int err = 0; + struct cxio_hal_resource *rscp; + + rscp = malloc(sizeof(*rscp), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!rscp) + return (-ENOMEM); + rdev_p->rscp = rscp; + err = cxio_init_resource_fifo_random(&rscp->tpt_fifo, + &rscp->tpt_fifo_lock, + nr_tpt, 1, 0); + if (err) + goto tpt_err; + err = cxio_init_qpid_fifo(rdev_p); + if (err) + goto qpid_err; + err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, + nr_cqid, 1, 0); + if (err) + goto cqid_err; + err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; +pdid_err: + buf_ring_free(rscp->cqid_fifo); +cqid_err: + buf_ring_free(rscp->qpid_fifo); +qpid_err: + buf_ring_free(rscp->tpt_fifo); +tpt_err: + return (-ENOMEM); +} + +/* + * returns 0 if no resource available + */ +static u32 cxio_hal_get_resource(struct buf_ring *fifo, struct mtx *lock) +{ + u32 entry; + + mtx_lock(lock); + entry = (u32)(uintptr_t)buf_ring_dequeue(fifo); + mtx_unlock(lock); + return entry; +} + +static void cxio_hal_put_resource(struct buf_ring *fifo, u32 entry, struct mtx *lock) +{ + mtx_lock(lock); + buf_ring_enqueue(fifo, (void *) (uintptr_t)entry); + mtx_unlock(lock); +} + +u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->tpt_fifo, &rscp->tpt_fifo_lock); +} + +void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) +{ + cxio_hal_put_resource(rscp->tpt_fifo, stag, &rscp->tpt_fifo_lock); +} + +u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) +{ + u32 qpid = cxio_hal_get_resource(rscp->qpid_fifo, &rscp->qpid_fifo_lock); + CTR2(KTR_IW_CXGB, "%s qpid 0x%x", __FUNCTION__, qpid); + return qpid; +} + +void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) +{ + CTR2(KTR_IW_CXGB, "%s qpid 0x%x", __FUNCTION__, qpid); + cxio_hal_put_resource(rscp->qpid_fifo, qpid, &rscp->qpid_fifo_lock); +} + +u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->cqid_fifo, &rscp->cqid_fifo_lock); +} + +void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) +{ + cxio_hal_put_resource(rscp->cqid_fifo, cqid, &rscp->cqid_fifo_lock); +} + +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->pdid_fifo, &rscp->pdid_fifo_lock); +} + +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 
pdid) +{ + cxio_hal_put_resource(rscp->pdid_fifo, pdid, &rscp->pdid_fifo_lock); +} + +void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) +{ + buf_ring_free(rscp->tpt_fifo); + buf_ring_free(rscp->cqid_fifo); + buf_ring_free(rscp->qpid_fifo); + buf_ring_free(rscp->pdid_fifo); + free(rscp, M_DEVBUF); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ +#define PBL_CHUNK 2*1024*1024 + +u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size); + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, (u32)addr, size); + return (u32)addr; +} + +void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, addr, size); + gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size); +} + +int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) +{ + + rdev_p->pbl_pool = gen_pool_create(rdev_p->rnic_info.pbl_base, MIN_PBL_SHIFT, + rdev_p->rnic_info.pbl_top - rdev_p->rnic_info.pbl_base); +#if 0 + if (rdev_p->pbl_pool) { + + unsigned long i; + for (i = rdev_p->rnic_info.pbl_base; + i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1; + i += PBL_CHUNK) + gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1); + } +#endif + return rdev_p->pbl_pool ? 0 : (-ENOMEM); +} + +void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == mini RQT size (16 entries) */ +#define RQT_CHUNK 2*1024*1024 + +u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6); + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, (u32)addr, size << 6); + return (u32)addr; +} + +void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, addr, size << 6); + gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6); +} + +int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p) +{ + + rdev_p->rqt_pool = gen_pool_create(rdev_p->rnic_info.rqt_base, + MIN_RQT_SHIFT, rdev_p->rnic_info.rqt_top - rdev_p->rnic_info.rqt_base); +#if 0 + if (rdev_p->rqt_pool) { + unsigned long i; + + for (i = rdev_p->rnic_info.rqt_base; + i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1; + i += RQT_CHUNK) + gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1); + } +#endif + return rdev_p->rqt_pool ? 0 : (-ENOMEM); +} + +void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->rqt_pool); +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h new file mode 100644 index 0000000000000..e0282a3453028 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h @@ -0,0 +1,59 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef __CXIO_RESOURCE_H__ +#define __CXIO_RESOURCE_H__ + +extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl); +extern void cxio_hal_destroy_rhdl_resource(void); +extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, + u32 nr_pdid); +extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag); +extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid); +extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid); +extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp); + +#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base ) +extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); + +#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base ) +extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h new file mode 100644 index 0000000000000..3086a6340a115 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2007, 2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $FreeBSD$ + */ +#ifndef __IWCH_USER_H__ +#define __IWCH_USER_H__ + +#define IWCH_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in uint64_t + * instead. + */ +struct iwch_create_cq_req { + uint64_t user_rptr_addr; +}; + +struct iwch_create_cq_resp { + uint64_t key; + uint32_t cqid; + uint32_t size_log2; +}; + +struct iwch_create_qp_resp { + uint64_t key; + uint64_t db_key; + uint32_t qpid; + uint32_t size_log2; + uint32_t sq_size_log2; + uint32_t rq_size_log2; +}; + +struct iwch_reg_user_mr_resp { + uint32_t pbl_addr; +}; +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h new file mode 100644 index 0000000000000..bf8f2d609e041 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h @@ -0,0 +1,684 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +$FreeBSD$ + +***************************************************************************/ +#ifndef __CXIO_WR_H__ +#define __CXIO_WR_H__ +#define T3_MAX_SGE 4 +#define T3_MAX_INLINE 64 + +#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) +#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ + ((rptr)!=(wptr)) ) +#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1)) +#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<<size_log2)-((wptr)-(rptr))) +#define Q_COUNT(rptr,wptr) ((wptr)-(rptr)) +#define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1)) + +static __inline void +ring_doorbell(void /* __iomem */ *doorbell, u32 qpid) +{ + writel(doorbell, ((1<<31) | qpid)); +} + +#define SEQ32_GE(x,y) (!( (((u32) (x)) - ((u32) (y))) & 0x80000000 )) + +enum t3_wr_flags { + T3_COMPLETION_FLAG = 0x01, + T3_NOTIFY_FLAG = 0x02, + T3_SOLICITED_EVENT_FLAG = 0x04, + T3_READ_FENCE_FLAG = 0x08, + T3_LOCAL_FENCE_FLAG = 0x10 +} __attribute__ ((packed)); + +enum t3_wr_opcode { + T3_WR_BP = FW_WROPCODE_RI_BYPASS, + T3_WR_SEND = FW_WROPCODE_RI_SEND, + T3_WR_WRITE = FW_WROPCODE_RI_RDMA_WRITE, + T3_WR_READ = FW_WROPCODE_RI_RDMA_READ, + T3_WR_INV_STAG = FW_WROPCODE_RI_LOCAL_INV, + T3_WR_BIND = FW_WROPCODE_RI_BIND_MW, + T3_WR_RCV = FW_WROPCODE_RI_RECEIVE, + T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT, + T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP +} __attribute__ ((packed)); + +enum t3_rdma_opcode { + T3_RDMA_WRITE, /* IETF RDMAP v1.0 ... */ + T3_READ_REQ, + T3_READ_RESP, + T3_SEND, + T3_SEND_WITH_INV, + T3_SEND_WITH_SE, + T3_SEND_WITH_SE_INV, + T3_TERMINATE, + T3_RDMA_INIT, /* CHELSIO RI specific ... */ + T3_BIND_MW, + T3_FAST_REGISTER, + T3_LOCAL_INV, + T3_QP_MOD, + T3_BYPASS +} __attribute__ ((packed)); + +static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop) +{ + switch (wrop) { + case T3_WR_BP: return T3_BYPASS; + case T3_WR_SEND: return T3_SEND; + case T3_WR_WRITE: return T3_RDMA_WRITE; + case T3_WR_READ: return T3_READ_REQ; + case T3_WR_INV_STAG: return T3_LOCAL_INV; + case T3_WR_BIND: return T3_BIND_MW; + case T3_WR_INIT: return T3_RDMA_INIT; + case T3_WR_QP_MOD: return T3_QP_MOD; + default: break; + } + return -1; +} + + +/* Work request id */ +union t3_wrid { + struct { + u32 hi; + u32 low; + } id0; + u64 id1; +}; + +#define WRID(wrid) (wrid.id1) +#define WRID_GEN(wrid) (wrid.id0.wr_gen) +#define WRID_IDX(wrid) (wrid.id0.wr_idx) +#define WRID_LO(wrid) (wrid.id0.wr_lo) + +struct fw_riwrh { + __be32 op_seop_flags; + __be32 gen_tid_len; +}; + +#define S_FW_RIWR_OP 24 +#define M_FW_RIWR_OP 0xff +#define V_FW_RIWR_OP(x) ((x) << S_FW_RIWR_OP) +#define G_FW_RIWR_OP(x) ((((x) >> S_FW_RIWR_OP)) & M_FW_RIWR_OP) + +#define S_FW_RIWR_SOPEOP 22 +#define M_FW_RIWR_SOPEOP 0x3 +#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP) + +#define S_FW_RIWR_FLAGS 8 +#define M_FW_RIWR_FLAGS 0x3fffff +#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS) +#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS) + +#define S_FW_RIWR_TID 8 +#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID) + +#define S_FW_RIWR_LEN 0 +#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN) + +#define S_FW_RIWR_GEN 31 +#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN) + +struct t3_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +/* If num_sgle is zero, flit 5+ contains immediate data.*/ +struct t3_send_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be32 plen; /* 3 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */ +}; + 
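The S_/M_/V_/G_ macros above follow the convention used throughout these headers: S_* is a field's bit offset, M_* its width mask, V_*(x) shifts a value into position, and G_*(x) extracts it again. A minimal sketch of round-tripping the opcode field of a work request header with the FW_RIWR macros defined above (byte-order conversion is left out here; the real structures store these words big-endian via htobe32()):

    /* Illustration only: pack opcode, SOP/EOP and flags, then recover
     * the opcode.  All names come from the definitions above. */
    u32 hdr = V_FW_RIWR_OP(T3_WR_SEND) |
              V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) |   /* mark SOP and EOP */
              V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG);

    enum t3_wr_opcode op = G_FW_RIWR_OP(hdr);        /* yields T3_WR_SEND */

The same naming convention is used further down for the TPT entry and CQE fields, so each G_* accessor can be read directly off its S_*/M_* pair.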
+struct t3_local_inv_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 reserved3; +}; + +struct t3_rdma_write_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 stag_sink; + __be64 to_sink; /* 3 */ + __be32 plen; /* 4 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */ +}; + +struct t3_rdma_read_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be64 rem_to; /* 3 */ + __be32 local_stag; /* 4 */ + __be32 local_len; + __be64 local_to; /* 5 */ +}; + +enum t3_addr_type { + T3_VA_BASED_TO = 0x0, + T3_ZERO_BASED_TO = 0x1 +} __attribute__ ((packed)); + +enum t3_mem_perms { + T3_MEM_ACCESS_LOCAL_READ = 0x1, + T3_MEM_ACCESS_LOCAL_WRITE = 0x2, + T3_MEM_ACCESS_REM_READ = 0x4, + T3_MEM_ACCESS_REM_WRITE = 0x8 +} __attribute__ ((packed)); + +struct t3_bind_mw_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u16 reserved; /* 2 */ + u8 type; + u8 perms; + __be32 mr_stag; + __be32 mw_stag; /* 3 */ + __be32 mw_len; + __be64 mw_va; /* 4 */ + __be32 mr_pbl_addr; /* 5 */ + u8 reserved2[3]; + u8 mr_pagesz; +}; + +struct t3_receive_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 pagesz[T3_MAX_SGE]; + __be32 num_sgle; /* 2 */ + struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */ + __be32 pbl_addr[T3_MAX_SGE]; +}; + +struct t3_bypass_wr { + struct fw_riwrh wrh; + union t3_wrid wrid; /* 1 */ +}; + +struct t3_modify_qp_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 flags; /* 2 */ + __be32 quiesce; /* 2 */ + __be32 max_ird; /* 3 */ + __be32 max_ord; /* 3 */ + __be64 sge_cmd; /* 4 */ + __be64 ctx1; /* 5 */ + __be64 ctx0; /* 6 */ +}; + +enum t3_modify_qp_flags { + MODQP_QUIESCE = 0x01, + MODQP_MAX_IRD = 0x02, + MODQP_MAX_ORD = 0x04, + MODQP_WRITE_EC = 0x08, + MODQP_READ_EC = 0x10, +}; + + +enum t3_mpa_attrs { + uP_RI_MPA_RX_MARKER_ENABLE = 0x1, + uP_RI_MPA_TX_MARKER_ENABLE = 0x2, + uP_RI_MPA_CRC_ENABLE = 0x4, + uP_RI_MPA_IETF_ENABLE = 0x8 +} __attribute__ ((packed)); + +enum t3_qp_caps { + uP_RI_QP_RDMA_READ_ENABLE = 0x01, + uP_RI_QP_RDMA_WRITE_ENABLE = 0x02, + uP_RI_QP_BIND_ENABLE = 0x04, + uP_RI_QP_FAST_REGISTER_ENABLE = 0x08, + uP_RI_QP_STAG0_ENABLE = 0x10 +} __attribute__ ((packed)); + +struct t3_rdma_init_attr { + u32 tid; + u32 qpid; + u32 pdid; + u32 scqid; + u32 rcqid; + u32 rq_addr; + u32 rq_size; + enum t3_mpa_attrs mpaattrs; + enum t3_qp_caps qpcaps; + u16 tcp_emss; + u32 ord; + u32 ird; + u64 qp_dma_addr; + u32 qp_dma_size; + u32 flags; + u32 irs; +}; + +struct t3_rdma_init_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 qpid; /* 2 */ + __be32 pdid; + __be32 scqid; /* 3 */ + __be32 rcqid; + __be32 rq_addr; /* 4 */ + __be32 rq_size; + u8 mpaattrs; /* 5 */ + u8 qpcaps; + __be16 ulpdu_size; + __be32 flags; /* bits 31-1 - reservered */ + /* bit 0 - set if RECV posted */ + __be32 ord; /* 6 */ + __be32 ird; + __be64 qp_dma_addr; /* 7 */ + __be32 qp_dma_size; /* 8 */ + u32 irs; +}; + +struct t3_genbit { + u64 flit[15]; + __be64 genbit; +}; + +enum rdma_init_wr_flags { + RECVS_POSTED = 1, +}; + +union t3_wr { + struct t3_send_wr send; + struct t3_rdma_write_wr write; + struct t3_rdma_read_wr read; + struct t3_receive_wr recv; + struct t3_local_inv_wr local_inv; + struct t3_bind_mw_wr bind; + struct t3_bypass_wr bypass; + struct t3_rdma_init_wr init; + struct t3_modify_qp_wr qp_mod; + struct t3_genbit genbit; + u64 flit[16]; +}; + 
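Every work request variant above overlays the same 16-flit (128-byte) slot, which is why union t3_wr ends in u64 flit[16] and why struct t3_genbit aliases the last flit as the generation bit. The Q_* macros near the top of this header treat the ring's read and write pointers as free-running counters; a short worked sketch, assuming a 2^6 = 64-entry ring:

    /* Illustration only: free-running pointers into a 64-entry ring. */
    u32 size_log2 = 6;
    u32 rptr = 10, wptr = 70;                     /* wptr has wrapped once */

    u32 idx  = Q_PTR2IDX(wptr, size_log2);        /* 70 & 63 == 6          */
    int gen  = Q_GENBIT(wptr, size_log2);         /* !((70 >> 6) & 1) == 0 */
    u32 room = Q_FREECNT(rptr, wptr, size_log2);  /* 64 - (70 - 10) == 4   */

Because the expected generation flips on every wrap, a consumer can tell a freshly written entry from a stale one without a separate valid flag; that is the comparison the CQE code later in this header makes against the genbit stored in each entry.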
+#define T3_SQ_CQE_FLIT 13 +#define T3_SQ_COOKIE_FLIT 14 + +#define T3_RQ_COOKIE_FLIT 13 +#define T3_RQ_CQE_FLIT 14 + +static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe) +{ + return G_FW_RIWR_OP(be32toh(wqe->op_seop_flags)); +} + +static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op, + enum t3_wr_flags flags, u8 genbit, u32 tid, + u8 len) +{ + wqe->op_seop_flags = htobe32(V_FW_RIWR_OP(op) | + V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) | + V_FW_RIWR_FLAGS(flags)); + wmb(); + wqe->gen_tid_len = htobe32(V_FW_RIWR_GEN(genbit) | + V_FW_RIWR_TID(tid) | + V_FW_RIWR_LEN(len)); + /* 2nd gen bit... */ + ((union t3_wr *)wqe)->genbit.genbit = htobe64(genbit); +} + +/* + * T3 ULP2_TX commands + */ +enum t3_utx_mem_op { + T3_UTX_MEM_READ = 2, + T3_UTX_MEM_WRITE = 3 +}; + +/* T3 MC7 RDMA TPT entry format */ + +enum tpt_mem_type { + TPT_NON_SHARED_MR = 0x0, + TPT_SHARED_MR = 0x1, + TPT_MW = 0x2, + TPT_MW_RELAXED_PROTECTION = 0x3 +}; + +enum tpt_addr_type { + TPT_ZBTO = 0, + TPT_VATO = 1 +}; + +enum tpt_mem_perm { + TPT_LOCAL_READ = 0x8, + TPT_LOCAL_WRITE = 0x4, + TPT_REMOTE_READ = 0x2, + TPT_REMOTE_WRITE = 0x1 +}; + +struct tpt_entry { + __be32 valid_stag_pdid; + __be32 flags_pagesize_qpid; + + __be32 rsvd_pbl_addr; + __be32 len; + __be32 va_hi; + __be32 va_low_or_fbo; + + __be32 rsvd_bind_cnt_or_pstag; + __be32 rsvd_pbl_size; +}; + +#define S_TPT_VALID 31 +#define V_TPT_VALID(x) ((x) << S_TPT_VALID) +#define F_TPT_VALID V_TPT_VALID(1U) + +#define S_TPT_STAG_KEY 23 +#define M_TPT_STAG_KEY 0xFF +#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY) +#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY) + +#define S_TPT_STAG_STATE 22 +#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE) +#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U) + +#define S_TPT_STAG_TYPE 20 +#define M_TPT_STAG_TYPE 0x3 +#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE) +#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE) + +#define S_TPT_PDID 0 +#define M_TPT_PDID 0xFFFFF +#define V_TPT_PDID(x) ((x) << S_TPT_PDID) +#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID) + +#define S_TPT_PERM 28 +#define M_TPT_PERM 0xF +#define V_TPT_PERM(x) ((x) << S_TPT_PERM) +#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM) + +#define S_TPT_REM_INV_DIS 27 +#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS) +#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U) + +#define S_TPT_ADDR_TYPE 26 +#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE) +#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U) + +#define S_TPT_MW_BIND_ENABLE 25 +#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE) +#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U) + +#define S_TPT_PAGE_SIZE 20 +#define M_TPT_PAGE_SIZE 0x1F +#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE) +#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE) + +#define S_TPT_PBL_ADDR 0 +#define M_TPT_PBL_ADDR 0x1FFFFFFF +#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR) +#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR) + +#define S_TPT_QPID 0 +#define M_TPT_QPID 0xFFFFF +#define V_TPT_QPID(x) ((x) << S_TPT_QPID) +#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID) + +#define S_TPT_PSTAG 0 +#define M_TPT_PSTAG 0xFFFFFF +#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG) +#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG) + +#define S_TPT_PBL_SIZE 0 +#define M_TPT_PBL_SIZE 0xFFFFF +#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE) +#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & 
M_TPT_PBL_SIZE) + +/* + * CQE defs + */ +struct t3_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 wrid_hi; + u32 wrid_low; + } scqe; + } u; +}; + +#define S_CQE_OOO 31 +#define M_CQE_OOO 0x1 +#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO) +#define V_CEQ_OOO(x) ((x)<<S_CQE_OOO) + +#define S_CQE_QPID 12 +#define M_CQE_QPID 0x7FFFF +#define G_CQE_QPID(x) ((((x) >> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<<S_CQE_QPID) + +#define S_CQE_SWCQE 11 +#define M_CQE_SWCQE 0x1 +#define G_CQE_SWCQE(x) ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<<S_CQE_SWCQE) + +#define S_CQE_GENBIT 10 +#define M_CQE_GENBIT 0x1 +#define G_CQE_GENBIT(x) (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<<S_CQE_GENBIT) + +#define S_CQE_STATUS 5 +#define M_CQE_STATUS 0x1F +#define G_CQE_STATUS(x) ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<<S_CQE_STATUS) + +#define S_CQE_TYPE 4 +#define M_CQE_TYPE 0x1 +#define G_CQE_TYPE(x) ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<<S_CQE_TYPE) + +#define S_CQE_OPCODE 0 +#define M_CQE_OPCODE 0xF +#define G_CQE_OPCODE(x) ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<<S_CQE_OPCODE) + +#define SW_CQE(x) (G_CQE_SWCQE(be32toh((x).header))) +#define CQE_OOO(x) (G_CQE_OOO(be32toh((x).header))) +#define CQE_QPID(x) (G_CQE_QPID(be32toh((x).header))) +#define CQE_GENBIT(x) (G_CQE_GENBIT(be32toh((x).header))) +#define CQE_TYPE(x) (G_CQE_TYPE(be32toh((x).header))) +#define SQ_TYPE(x) (CQE_TYPE((x))) +#define RQ_TYPE(x) (!CQE_TYPE((x))) +#define CQE_STATUS(x) (G_CQE_STATUS(be32toh((x).header))) +#define CQE_OPCODE(x) (G_CQE_OPCODE(be32toh((x).header))) + +#define CQE_LEN(x) (be32toh((x).len)) + +/* used for RQ completion processing */ +#define CQE_WRID_STAG(x) (be32toh((x).u.rcqe.stag)) +#define CQE_WRID_MSN(x) (be32toh((x).u.rcqe.msn)) + +/* used for SQ completion processing */ +#define CQE_WRID_SQ_WPTR(x) ((x).u.scqe.wrid_hi) +#define CQE_WRID_WPTR(x) ((x).u.scqe.wrid_low) + +/* generic accessor macros */ +#define CQE_WRID_HI(x) ((x).u.scqe.wrid_hi) +#define CQE_WRID_LOW(x) ((x).u.scqe.wrid_low) + +#define TPT_ERR_SUCCESS 0x0 +#define TPT_ERR_STAG 0x1 /* STAG invalid: either the */ + /* STAG is offlimt, being 0, */ + /* or STAG_key mismatch */ +#define TPT_ERR_PDID 0x2 /* PDID mismatch */ +#define TPT_ERR_QPID 0x3 /* QPID mismatch */ +#define TPT_ERR_ACCESS 0x4 /* Invalid access right */ +#define TPT_ERR_WRAP 0x5 /* Wrap error */ +#define TPT_ERR_BOUND 0x6 /* base and bounds voilation */ +#define TPT_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */ + /* shared memory region */ +#define TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */ + /* shared memory region */ +#define TPT_ERR_ECC 0x9 /* ECC error detected */ +#define TPT_ERR_ECC_PSTAG 0xA /* ECC error detected when */ + /* reading PSTAG for a MW */ + /* Invalidate */ +#define TPT_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */ + /* software error */ +#define TPT_ERR_SWFLUSH 0xC /* SW FLUSHED */ +#define TPT_ERR_CRC 0x10 /* CRC error */ +#define TPT_ERR_MARKER 0x11 /* Marker error */ +#define TPT_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */ +#define TPT_ERR_OUT_OF_RQE 0x13 /* out of RQE */ +#define TPT_ERR_DDP_VERSION 0x14 /* wrong DDP version */ +#define TPT_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */ +#define TPT_ERR_OPCODE 0x16 /* invalid rdma opcode */ +#define TPT_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp 
queue number */ +#define TPT_ERR_MSN 0x18 /* MSN error */ +#define TPT_ERR_TBIT 0x19 /* tag bit not set correctly */ +#define TPT_ERR_MO 0x1A /* MO not 0 for TERMINATE */ + /* or READ_REQ */ +#define TPT_ERR_MSN_GAP 0x1B +#define TPT_ERR_MSN_RANGE 0x1C +#define TPT_ERR_IRD_OVERFLOW 0x1D +#define TPT_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */ + /* software error */ +#define TPT_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */ + /* mismatch) */ + +struct t3_swsq { + uint64_t wr_id; + struct t3_cqe cqe; + uint32_t sq_wptr; + uint32_t read_len; + int opcode; + int complete; + int signaled; +}; + +/* + * A T3 WQ implements both the SQ and RQ. + */ +struct t3_wq { + union t3_wr *queue; /* DMA accessable memory */ + bus_addr_t dma_addr; /* DMA address for HW */ +#ifdef notyet + DECLARE_PCI_UNMAP_ADDR(mapping) /* unmap kruft */ +#endif + u32 error; /* 1 once we go to ERROR */ + u32 qpid; + u32 wptr; /* idx to next available WR slot */ + u32 size_log2; /* total wq size */ + struct t3_swsq *sq; /* SW SQ */ + struct t3_swsq *oldest_read; /* tracks oldest pending read */ + u32 sq_wptr; /* sq_wptr - sq_rptr == count of */ + u32 sq_rptr; /* pending wrs */ + u32 sq_size_log2; /* sq size */ + u64 *rq; /* SW RQ (holds consumer wr_ids */ + u32 rq_wptr; /* rq_wptr - rq_rptr == count of */ + u32 rq_rptr; /* pending wrs */ + u64 *rq_oldest_wr; /* oldest wr on the SW RQ */ + u32 rq_size_log2; /* rq size */ + u32 rq_addr; /* rq adapter address */ + void /* __iomem */ *doorbell; /* kernel db */ + u64 udb; /* user db if any */ +}; + +struct t3_cq { + u32 cqid; + u32 rptr; + u32 wptr; + u32 size_log2; + bus_addr_t dma_addr; +#ifdef notyet + DECLARE_PCI_UNMAP_ADDR(mapping) +#endif + struct t3_cqe *queue; + struct t3_cqe *sw_queue; + u32 sw_rptr; + u32 sw_wptr; +}; + +#define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \ + CQE_GENBIT(*cqe)) + +static inline void cxio_set_wq_in_error(struct t3_wq *wq) +{ + wq->queue->flit[13] = 1; +} + +static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + return NULL; +} + +static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +#endif diff --git a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h index 6c97a27f674b8..56ccda949beb8 100644 --- a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h +++ b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h @@ -31,7 +31,10 @@ $FreeBSD$ ***************************************************************************/ #ifndef _CXGB_TOEDEV_H_ -#define _CXGB_TOEDEV_H_ +#define _CXGB_TOEDEV_H_ +#ifdef notyet +#include <netinet/toedev.h> +#endif /* offload type ids */ enum { diff --git a/sys/dev/cxgb/ulp/toecore/toedev.c b/sys/dev/cxgb/ulp/toecore/toedev.c new file mode 100644 index 0000000000000..07a0d6e94feb9 --- /dev/null +++ b/sys/dev/cxgb/ulp/toecore/toedev.c @@ -0,0 +1,424 @@ + 
+/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/queue.h> +#include <sys/mbuf.h> +#include <sys/proc.h> + +#include <sys/socket.h> +#include <sys/sockio.h> + +#include <net/bpf.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/route.h> + + +/* + * XXX + */ +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + + + +static struct mtx offload_db_lock; +static TAILQ_HEAD(, toedev) offload_dev_list; +static TAILQ_HEAD(, tom_info) offload_module_list; + +/* + * Returns the entry in the given table with the given offload id, or NULL + * if the id is not found. + */ +static const struct offload_id * +id_find(unsigned int id, const struct offload_id *table) +{ + for ( ; table->id; ++table) + if (table->id == id) + return table; + return NULL; +} + +/* + * Returns true if an offload device is presently attached to an offload module. + */ +static inline int +is_attached(const struct toedev *dev) +{ + return dev->tod_offload_mod != NULL; +} + +/* + * Try to attach a new offload device to an existing TCP offload module that + * can handle the device's offload id. Returns 0 if it succeeds. + * + * Must be called with the offload_db_lock held. + */ +static int +offload_attach(struct toedev *dev) +{ + struct tom_info *t; + + TAILQ_FOREACH(t, &offload_module_list, entry) { + const struct offload_id *entry; + + entry = id_find(dev->tod_ttid, t->ti_id_table); + if (entry && t->ti_attach(dev, entry) == 0) { + dev->tod_offload_mod = t; + return 0; + } + } + return (ENOPROTOOPT); +} + +/** + * register_tom - register a TCP Offload Module (TOM) + * @t: the offload module to register + * + * Register a TCP Offload Module (TOM). 
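A TOM announces the offload ids it can drive through the table that id_find() above walks; a minimal sketch of the registration side, with the field names taken from the code in this file, the attach body reduced to a stub, and struct offload_id assumed to permit designated initialization of its id member:

    /* Illustration only: publish an id table and an attach hook. */
    static int
    my_tom_attach(struct toedev *dev, const struct offload_id *entry)
    {
            /* allocate per-device TOM state here */
            return (0);
    }

    static struct offload_id my_tom_ids[] = {
            { .id = TOE_ID_CHELSIO_T3 },
            { .id = 0 }                   /* id == 0 ends the table */
    };

    static struct tom_info my_tom_info = {
            .ti_attach   = my_tom_attach,
            .ti_id_table = my_tom_ids,
    };

register_tom(&my_tom_info) would then be called from the module's MOD_LOAD handler, and unregister_tom() from MOD_UNLOAD.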
+ */ +int +register_tom(struct tom_info *t) +{ + mtx_lock(&offload_db_lock); + TAILQ_INSERT_HEAD(&offload_module_list, t, entry); + mtx_unlock(&offload_db_lock); + return 0; +} + +/** + * unregister_tom - unregister a TCP Offload Module (TOM) + * @t: the offload module to register + * + * Unregister a TCP Offload Module (TOM). Note that this does not affect any + * TOE devices to which the TOM is already attached. + */ +int +unregister_tom(struct tom_info *t) +{ + mtx_lock(&offload_db_lock); + TAILQ_REMOVE(&offload_module_list, t, entry); + mtx_unlock(&offload_db_lock); + return 0; +} + +/* + * Find an offload device by name. Must be called with offload_db_lock held. + */ +static struct toedev * +__find_offload_dev_by_name(const char *name) +{ + struct toedev *dev; + + TAILQ_FOREACH(dev, &offload_dev_list, entry) { + if (!strncmp(dev->tod_name, name, TOENAMSIZ)) + return dev; + } + return NULL; +} + +/* + * Returns true if an offload device is already registered. + * Must be called with the offload_db_lock held. + */ +static int +is_registered(const struct toedev *dev) +{ + struct toedev *d; + + TAILQ_FOREACH(d, &offload_dev_list, entry) { + if (d == dev) + return 1; + } + return 0; +} + +/* + * Finalize the name of an offload device by assigning values to any format + * strings in its name. + */ +static int +assign_name(struct toedev *dev, const char *name, int limit) +{ + int i; + + for (i = 0; i < limit; ++i) { + char s[TOENAMSIZ]; + + if (snprintf(s, sizeof(s), name, i) >= sizeof(s)) + return -1; /* name too long */ + if (!__find_offload_dev_by_name(s)) { + strcpy(dev->tod_name, s); + return 0; + } + } + return -1; +} + +/** + * register_toedev - register a TOE device + * @dev: the device + * @name: a name template for the device + * + * Register a TOE device and try to attach an appropriate TCP offload module + * to it. @name is a template that may contain at most one %d format + * specifier. + */ +int +register_toedev(struct toedev *dev, const char *name) +{ + int ret; + const char *p; + + /* + * Validate the name template. Only one %d allowed and name must be + * a valid filename so it can appear in sysfs. + */ + if (!name || !*name || !strcmp(name, ".") || !strcmp(name, "..") || + strchr(name, '/')) + return EINVAL; + + p = strchr(name, '%'); + if (p && (p[1] != 'd' || strchr(p + 2, '%'))) + return EINVAL; + + mtx_lock(&offload_db_lock); + if (is_registered(dev)) { /* device already registered */ + ret = EEXIST; + goto out; + } + + if ((ret = assign_name(dev, name, 32)) != 0) + goto out; + + dev->tod_offload_mod = NULL; + TAILQ_INSERT_TAIL(&offload_dev_list, dev, entry); +out: + mtx_unlock(&offload_db_lock); + return ret; +} + +/** + * unregister_toedev - unregister a TOE device + * @dev: the device + * + * Unregister a TOE device. The device must not be attached to an offload + * module. + */ +int +unregister_toedev(struct toedev *dev) +{ + int ret = 0; + + mtx_lock(&offload_db_lock); + if (!is_registered(dev)) { + ret = ENODEV; + goto out; + } + if (is_attached(dev)) { + ret = EBUSY; + goto out; + } + TAILQ_REMOVE(&offload_dev_list, dev, entry); +out: + mtx_unlock(&offload_db_lock); + return ret; +} + +/** + * activate_offload - activate an offload device + * @dev: the device + * + * Activate an offload device by locating an appropriate registered offload + * module. If no module is found the operation fails and may be retried at + * a later time. 
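On the driver side the sequence is the mirror image: register the device under a unit-number template, then ask toecore to bind whatever TOM is loaded. A minimal sketch, assuming tdev points at a struct toedev embedded in the driver's softc with tod_ttid already filled in:

    /* Illustration only: toecore assigns the first free unit of "toe%d". */
    int err;

    if ((err = register_toedev(tdev, "toe%d")) != 0)
            return (err);

    if (activate_offload(tdev) != 0)
            printf("%s: no TOM attached yet\n", tdev->tod_name);

A failure from activate_offload() is not fatal here; as the comment above says, it can simply be retried after a TOM has been loaded.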
+ */ +int +activate_offload(struct toedev *dev) +{ + int ret = 0; + + mtx_lock(&offload_db_lock); + if (!is_registered(dev)) + ret = ENODEV; + else if (!is_attached(dev)) + ret = offload_attach(dev); + mtx_unlock(&offload_db_lock); + return ret; +} + +/** + * toe_send - send a packet to a TOE device + * @dev: the device + * @m: the packet + * + * Sends an mbuf to a TOE driver after dealing with any active network taps. + */ +int +toe_send(struct toedev *dev, struct mbuf *m) +{ + int r; + + critical_enter(); /* XXX neccessary? */ + r = dev->tod_send(dev, m); + critical_exit(); + if (r) + BPF_MTAP(dev->tod_lldev, m); + return r; +} + +/** + * toe_receive_mbuf - process n received TOE packets + * @dev: the toe device + * @m: an array of offload packets + * @n: the number of offload packets + * + * Process an array of ingress offload packets. Each packet is forwarded + * to any active network taps and then passed to the toe device's receive + * method. We optimize passing packets to the receive method by passing + * it the whole array at once except when there are active taps. + */ +int +toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n) +{ + if (__predict_true(!bpf_peers_present(dev->tod_lldev->if_bpf))) + return dev->tod_recv(dev, m, n); + + for ( ; n; n--, m++) { + m[0]->m_pkthdr.rcvif = dev->tod_lldev; + BPF_MTAP(dev->tod_lldev, m[0]); + dev->tod_recv(dev, m, 1); + } + return 0; +} + +static inline int +ifnet_is_offload(const struct ifnet *ifp) +{ + return (ifp->if_flags & IFCAP_TOE); +} + +void +toe_arp_update(struct rtentry *rt) +{ + struct ifnet *ifp = rt->rt_ifp; + + if (ifp && ifnet_is_offload(ifp)) { + struct toedev *tdev = TOEDEV(ifp); + + if (tdev && tdev->tod_arp_update) + tdev->tod_arp_update(tdev, rt); + } +} + +/** + * offload_get_phys_egress - find the physical egress device + * @root_dev: the root device anchoring the search + * @so: the socket used to determine egress port in bonding mode + * @context: in bonding mode, indicates a connection set up or failover + * + * Given a root network device it returns the physical egress device that is a + * descendant of the root device. The root device may be either a physical + * device, in which case it is the device returned, or a virtual device, such + * as a VLAN or bonding device. In case of a bonding device the search + * considers the decisions of the bonding device given its mode to locate the + * correct egress device. 
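For the receive side shown above, the interesting caller is a driver's response-queue handler: it can collect a run of offload packets and pass the whole array to toe_receive_mbuf() in one call, and the slower per-packet BPF path is only taken when taps are active. A rough sketch, in which the batch size and the next_offload_packet() helper are purely illustrative stand-ins for the driver's own receive loop:

    /* Illustration only: hand a batch of offload packets to the TOE. */
    struct mbuf *batch[8];              /* size picked arbitrarily */
    int n = 0;

    /* next_offload_packet() is hypothetical, standing in for the rx loop */
    while (n < 8 && (batch[n] = next_offload_packet()) != NULL)
            n++;

    if (n > 0)
            toe_receive_mbuf(tdev, batch, n);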
+ */ +struct ifnet * +offload_get_phys_egress(struct ifnet *root_dev, struct socket *so, int context) +{ + +#if 0 + while (root_dev && ifnet_is_offload(root_dev)) { + if (root_dev->tod_priv_flags & IFF_802_1Q_VLAN) + root_dev = VLAN_DEV_INFO(root_dev)->real_dev; + else if (root_dev->tod_flags & IFF_MASTER) + root_dev = toe_bond_get_slave(root_dev, sk, context); + else + break; + } +#endif + return root_dev; +} + +static int +toecore_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + mtx_init(&offload_db_lock, "toedev lock", NULL, MTX_DEF); + TAILQ_INIT(&offload_dev_list); + TAILQ_INIT(&offload_module_list); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + mtx_lock(&offload_db_lock); + if (!TAILQ_EMPTY(&offload_dev_list) || + !TAILQ_EMPTY(&offload_module_list)) { + err = EBUSY; + mtx_unlock(&offload_db_lock); + break; + } + mtx_unlock(&offload_db_lock); + mtx_destroy(&offload_db_lock); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + + +static moduledata_t mod_data= { + "toecore", + toecore_load, + 0 +}; + +MODULE_VERSION(toecore, 1); +DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c new file mode 100644 index 0000000000000..00b45750e752b --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -0,0 +1,4456 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/protosw.h> +#include <sys/priv.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_syncache.h> +#include <netinet/tcp_timer.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/bus.h> +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> + +/* + * For ULP connections HW may add headers, e.g., for digests, that aren't part + * of the messages sent by the host but that are part of the TCP payload and + * therefore consume TCP sequence space. Tx connection parameters that + * operate in TCP sequence space are affected by the HW additions and need to + * compensate for them to accurately track TCP sequence numbers. This array + * contains the compensating extra lengths for ULP packets. It is indexed by + * a packet's ULP submode. + */ +const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; + +#ifdef notyet +/* + * This sk_buff holds a fake header-only TCP segment that we use whenever we + * need to exploit SW TCP functionality that expects TCP headers, such as + * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple + * CPUs without locking. + */ +static struct mbuf *tcphdr_mbuf __read_mostly; +#endif + +/* + * Size of WRs in bytes. Note that we assume all devices we are handling have + * the same WR size. + */ +static unsigned int wrlen __read_mostly; + +/* + * The number of WRs needed for an skb depends on the number of page fragments + * in the skb and whether it has any payload in its main body. This maps the + * length of the gather list represented by an skb into the # of necessary WRs. + */ +static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; + +/* + * Max receive window supported by HW in bytes. Only a small part of it can + * be set through option0, the rest needs to be set through RX_DATA_ACK. + */ +#define MAX_RCV_WND ((1U << 27) - 1) + +/* + * Min receive window. We want it to be large enough to accommodate receive + * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
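Returning to the t3_ulp_extra_len[] table above: it is indexed by a frame's ULP submode, and any transmit-side sequence accounting has to include these hardware-inserted bytes even though the host never queued them. A minimal sketch, where plen and ulp_submode stand in for values the caller already has:

    /* Illustration only: TCP sequence space consumed by one ULP frame,
     * including the bytes the HW adds (the table has four entries). */
    unsigned int seq_bytes = plen + t3_ulp_extra_len[ulp_submode & 3];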
+ */ +#define MIN_RCV_WND (24 * 1024U) +#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) + +#define VALIDATE_SEQ 0 +#define VALIDATE_SOCK(so) +#define DEBUG_WR 0 + +#define TCP_TIMEWAIT 1 +#define TCP_CLOSE 2 +#define TCP_DROP 3 + +extern int tcp_do_autorcvbuf; +extern int tcp_do_autosndbuf; +extern int tcp_autorcvbuf_max; +extern int tcp_autosndbuf_max; + +static void t3_send_reset(struct toepcb *toep); +static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); +static inline void free_atid(struct t3cdev *cdev, unsigned int tid); +static void handle_syncache_event(int event, void *arg); + +static inline void +SBAPPEND(struct sockbuf *sb, struct mbuf *n) +{ + struct mbuf *m; + + m = sb->sb_mb; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + m = n; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); + sbappendstream_locked(sb, n); + m = sb->sb_mb; + + while (m) { + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } +} + +static inline int +is_t3a(const struct toedev *dev) +{ + return (dev->tod_ttid == TOE_ID_CHELSIO_T3); +} + +static void +dump_toepcb(struct toepcb *toep) +{ + DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", + toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, + toep->tp_mtu_idx, toep->tp_tid); + + DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", + toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, + toep->tp_mss_clamp, toep->tp_flags); +} + +#ifndef RTALLOC2_DEFINED +static struct rtentry * +rtalloc2(struct sockaddr *dst, int report, u_long ignflags) +{ + struct rtentry *rt = NULL; + + if ((rt = rtalloc1(dst, report, ignflags)) != NULL) + RT_UNLOCK(rt); + + return (rt); +} +#endif + +/* + * Determine whether to send a CPL message now or defer it. A message is + * deferred if the connection is in SYN_SENT since we don't know the TID yet. + * For connections in other states the message is sent immediately. + * If through_l2t is set the message is subject to ARP processing, otherwise + * it is sent directly. + */ +static inline void +send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) +{ + struct tcpcb *tp = toep->tp_tp; + + if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { + inp_wlock(tp->t_inpcb); + mbufq_tail(&toep->out_of_order_queue, m); // defer + inp_wunlock(tp->t_inpcb); + } else if (through_l2t) + l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T + else + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly +} + +static inline unsigned int +mkprio(unsigned int cntrl, const struct toepcb *toep) +{ + return (cntrl); +} + +/* + * Populate a TID_RELEASE WR. The skb must be already propely sized. 
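Callers of the helper defined just below allocate the mbuf themselves and then hand the finished CPL to the adapter. A minimal sketch of releasing a TID outside the SYN_SENT deferral case, with allocation failure reduced to a NULL check:

    /* Illustration only: build a TID_RELEASE and send it directly;
     * no L2T/ARP processing is needed for this message. */
    struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

    if (m != NULL) {
            mk_tid_release(m, toep, toep->tp_tid);
            cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
    }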
+ */ +static inline void +mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) +{ + struct cpl_tid_release *req; + + m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req = mtod(m, struct cpl_tid_release *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +static inline void +make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct tx_data_wr *req; + struct sockbuf *snd; + + inp_lock_assert(tp->t_inpcb); + snd = so_sockbuf_snd(so); + + req = mtod(m, struct tx_data_wr *); + m->m_len = sizeof(*req); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); + /* len includes the length of any HW ULP additions */ + req->len = htonl(len); + req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); + /* V_TX_ULP_SUBMODE sets both the mode and submode */ + req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | + V_TX_URG(/* skb_urgent(skb) */ 0 ) | + V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && + (tail ? 0 : 1)))); + req->sndseq = htonl(tp->snd_nxt); + if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { + req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | + V_TX_CPU_IDX(toep->tp_qset)); + + /* Sendbuffer is in units of 32KB. + */ + if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) + req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15)); + else { + req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); + } + + toep->tp_flags |= TP_DATASENT; + } +} + +#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ + +int +t3_push_frames(struct socket *so, int req_completion) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + struct mbuf *tail, *m0, *last; + struct t3cdev *cdev; + struct tom_data *d; + int state, bytes, count, total_bytes; + bus_dma_segment_t segs[TX_MAX_SEGS], *segp; + struct sockbuf *snd; + + if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { + DPRINTF("tcp state=%d\n", tp->t_state); + return (0); + } + + state = so_state_get(so); + + if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { + DPRINTF("disconnecting\n"); + + return (0); + } + + inp_lock_assert(tp->t_inpcb); + + snd = so_sockbuf_snd(so); + sockbuf_lock(snd); + + d = TOM_DATA(toep->tp_toedev); + cdev = d->cdev; + + last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; + + total_bytes = 0; + DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", + toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last); + + if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) { + KASSERT(tail, ("sbdrop error")); + last = tail = tail->m_next; + } + + if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { + DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); + sockbuf_unlock(snd); + + return (0); + } + + toep->tp_m_last = NULL; + while (toep->tp_wr_avail && (tail != NULL)) { + count = bytes = 0; + segp = segs; + if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { + sockbuf_unlock(snd); + return (0); + } + /* + * If the data in tail fits as in-line, then + * make an immediate data wr. 
+ */ + if (tail->m_len <= IMM_LEN) { + count = 1; + bytes = tail->m_len; + last = tail; + tail = tail->m_next; + m_set_sgl(m0, NULL); + m_set_sgllen(m0, 0); + make_tx_data_wr(so, m0, bytes, tail); + m_append(m0, bytes, mtod(last, caddr_t)); + KASSERT(!m0->m_next, ("bad append")); + } else { + while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) + && (tail != NULL) && (count < TX_MAX_SEGS-1)) { + bytes += tail->m_len; + last = tail; + count++; + /* + * technically an abuse to be using this for a VA + * but less gross than defining my own structure + * or calling pmap_kextract from here :-| + */ + segp->ds_addr = (bus_addr_t)tail->m_data; + segp->ds_len = tail->m_len; + DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", + count, mbuf_wrs[count], tail->m_data, tail->m_len); + segp++; + tail = tail->m_next; + } + DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", + toep->tp_wr_avail, count, mbuf_wrs[count], tail); + + m_set_sgl(m0, segs); + m_set_sgllen(m0, count); + make_tx_data_wr(so, m0, bytes, tail); + } + m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); + + if (tail) { + snd->sb_sndptr = tail; + toep->tp_m_last = NULL; + } else + toep->tp_m_last = snd->sb_sndptr = last; + + + DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); + + snd->sb_sndptroff += bytes; + total_bytes += bytes; + toep->tp_write_seq += bytes; + CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d" + " tail=%p sndptr=%p sndptroff=%d", + toep->tp_wr_avail, count, mbuf_wrs[count], + tail, snd->sb_sndptr, snd->sb_sndptroff); + if (tail) + CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d" + " tp_m_last=%p tailbuf=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tail->m_data, + tp->snd_una); + else + CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d" + " tp_m_last=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tp->snd_una); + + +#ifdef KTR +{ + int i; + + i = 0; + while (i < count && m_get_sgllen(m0)) { + if ((count - i) >= 3) { + CTR6(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" + " len=%d pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len, + segs[i + 1].ds_addr, segs[i + 1].ds_len, + segs[i + 2].ds_addr, segs[i + 2].ds_len); + i += 3; + } else if ((count - i) == 2) { + CTR4(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" + " len=%d", + segs[i].ds_addr, segs[i].ds_len, + segs[i + 1].ds_addr, segs[i + 1].ds_len); + i += 2; + } else { + CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len); + i++; + } + + } +} +#endif + /* + * remember credits used + */ + m0->m_pkthdr.csum_data = mbuf_wrs[count]; + m0->m_pkthdr.len = bytes; + toep->tp_wr_avail -= mbuf_wrs[count]; + toep->tp_wr_unacked += mbuf_wrs[count]; + + if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || + toep->tp_wr_unacked >= toep->tp_wr_max / 2) { + struct work_request_hdr *wr = cplhdr(m0); + + wr->wr_hi |= htonl(F_WR_COMPL); + toep->tp_wr_unacked = 0; + } + KASSERT((m0->m_pkthdr.csum_data > 0) && + (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", + m0->m_pkthdr.csum_data)); + m0->m_type = MT_DONTFREE; + enqueue_wr(toep, m0); + DPRINTF("sending offload tx with %d bytes in %d segments\n", + bytes, count); + l2t_send(cdev, m0, toep->tp_l2t); + } + sockbuf_unlock(snd); + return (total_bytes); +} + +/* + * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail + * under any circumstances. We take the easy way out and always queue the + * message to the write_queue. We can optimize the case where the queue is + * already empty though the optimization is probably not worth it. 
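Before the connection-close path below, two details of the transmit loop above are worth calling out: the WR credits a frame consumes ride along in m_pkthdr.csum_data, and a completion is requested once the credits consumed since the last request reach half of tp_wr_max. Worked through with tp_wr_max == 16: when tp_wr_unacked reaches 8, the WR that crossed the threshold gets F_WR_COMPL set and tp_wr_unacked restarts from 0. The completion handler is not part of this excerpt, so the line below is only an assumption about its effect:

    /* Assumed (handler not shown here): credits stashed at transmit
     * time are simply given back when the WR completion arrives. */
    toep->tp_wr_avail += m->m_pkthdr.csum_data;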
+ */ +static void +close_conn(struct socket *so) +{ + struct mbuf *m; + struct cpl_close_con_req *req; + struct tom_data *d; + struct inpcb *inp = so_sotoinpcb(so); + struct tcpcb *tp; + struct toepcb *toep; + unsigned int tid; + + + inp_wlock(inp); + tp = so_sototcpcb(so); + toep = tp->t_toe; + + if (tp->t_state != TCPS_SYN_SENT) + t3_push_frames(so, 1); + + if (toep->tp_flags & TP_FIN_SENT) { + inp_wunlock(inp); + return; + } + + tid = toep->tp_tid; + + d = TOM_DATA(toep->tp_toedev); + + m = m_gethdr_nofail(sizeof(*req)); + m_set_priority(m, CPL_PRIORITY_DATA); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + + toep->tp_flags |= TP_FIN_SENT; + req = mtod(m, struct cpl_close_con_req *); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = 0; + inp_wunlock(inp); + /* + * XXX - need to defer shutdown while there is still data in the queue + * + */ + CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid); + cxgb_ofld_send(d->cdev, m); + +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. + */ +static void +abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) +{ + struct cpl_abort_req *req = cplhdr(m); + + req->cmd = CPL_ABORT_NO_RST; + cxgb_ofld_send(cdev, m); +} + +/* + * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are + * permitted to return without sending the message in case we cannot allocate + * an sk_buff. Returns the number of credits sent. + */ +uint32_t +t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = toep->tp_toedev; + + m = m_gethdr_nofail(sizeof(*req)); + + DPRINTF("returning %u credits to HW\n", credits); + + req = mtod(m, struct cpl_rx_data_ack *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); + m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + return (credits); +} + +/* + * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. + * This is only used in DDP mode, so we take the opportunity to also set the + * DACK mode and flush any Rx credits. + */ +void +t3_send_rx_modulate(struct toepcb *toep) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + + m = m_gethdr_nofail(sizeof(*req)); + + req = mtod(m, struct cpl_rx_data_ack *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + m->m_pkthdr.len = m->m_len = sizeof(*req); + + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(1) | + V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); + toep->tp_rcv_wup = toep->tp_copied_seq; +} + +/* + * Handle receipt of an urgent pointer. 
+ */ +static void +handle_urg_ptr(struct socket *so, uint32_t urg_seq) +{ +#ifdef URGENT_DATA_SUPPORTED + struct tcpcb *tp = so_sototcpcb(so); + + urg_seq--; /* initially points past the urgent data, per BSD */ + + if (tp->urg_data && !after(urg_seq, tp->urg_seq)) + return; /* duplicate pointer */ + sk_send_sigurg(sk); + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + tp->copied_seq++; + if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) + tom_eat_skb(sk, skb, 0); + } + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = urg_seq; +#endif +} + +/* + * Returns true if a socket cannot accept new Rx data. + */ +static inline int +so_no_receive(const struct socket *so) +{ + return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); +} + +/* + * Process an urgent data notification. + */ +static void +rx_urg_notify(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_rx_urg_notify *hdr = cplhdr(m); + struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); + + VALIDATE_SOCK(so); + + if (!so_no_receive(so)) + handle_urg_ptr(so, ntohl(hdr->seq)); + + m_freem(m); +} + +/* + * Handler for RX_URG_NOTIFY CPL messages. + */ +static int +do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + rx_urg_notify(toep, m); + return (0); +} + +static __inline int +is_delack_mode_valid(struct toedev *dev, struct toepcb *toep) +{ + return (toep->tp_ulp_mode || + (toep->tp_ulp_mode == ULP_MODE_TCPDDP && + dev->tod_ttid >= TOE_ID_CHELSIO_T3)); +} + +/* + * Set of states for which we should return RX credits. + */ +#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) + +/* + * Called after some received data has been read. It returns RX credits + * to the HW for the amount of data processed. 
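The expected caller is the socket receive path once data has left the receive buffer; cxgb_toe_rcvd() further down passes copied == 0 and lets the function derive the count from the sockbuf itself. A minimal sketch of the explicit form, assuming the caller knows how many bytes it just handed to userland:

    /* Illustration only: the inpcb lock must be held, as the
     * inp_lock_assert() inside the function checks. */
    inp_wlock(tp->t_inpcb);
    t3_cleanup_rbuf(tp, copied);
    inp_wunlock(tp->t_inpcb);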
+ */ +void +t3_cleanup_rbuf(struct tcpcb *tp, int copied) +{ + struct toepcb *toep = tp->t_toe; + struct socket *so; + struct toedev *dev; + int dack_mode, must_send, read; + u32 thres, credits, dack = 0; + struct sockbuf *rcv; + + so = inp_inpcbtosocket(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + + if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || + (tp->t_state == TCPS_FIN_WAIT_2))) { + if (copied) { + sockbuf_lock(rcv); + toep->tp_copied_seq += copied; + sockbuf_unlock(rcv); + } + + return; + } + + inp_lock_assert(tp->t_inpcb); + + sockbuf_lock(rcv); + if (copied) + toep->tp_copied_seq += copied; + else { + read = toep->tp_enqueued_bytes - rcv->sb_cc; + toep->tp_copied_seq += read; + } + credits = toep->tp_copied_seq - toep->tp_rcv_wup; + toep->tp_enqueued_bytes = rcv->sb_cc; + sockbuf_unlock(rcv); + + if (credits > rcv->sb_mbmax) { + log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", + toep->tp_copied_seq, toep->tp_rcv_wup, credits); + credits = rcv->sb_mbmax; + } + + + /* + * XXX this won't accurately reflect credit return - we need + * to look at the difference between the amount that has been + * put in the recv sockbuf and what is there now + */ + + if (__predict_false(!credits)) + return; + + dev = toep->tp_toedev; + thres = TOM_TUNABLE(dev, rx_credit_thres); + + if (__predict_false(thres == 0)) + return; + + if (is_delack_mode_valid(dev, toep)) { + dack_mode = TOM_TUNABLE(dev, delack); + if (__predict_false(dack_mode != toep->tp_delack_mode)) { + u32 r = tp->rcv_nxt - toep->tp_delack_seq; + + if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) + dack = F_RX_DACK_CHANGE | + V_RX_DACK_MODE(dack_mode); + } + } else + dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + + /* + * For coalescing to work effectively ensure the receive window has + * at least 16KB left. + */ + must_send = credits + 16384 >= tp->rcv_wnd; + + if (must_send || credits >= thres) + toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); +} + +static int +cxgb_toe_disconnect(struct tcpcb *tp) +{ + struct socket *so; + + DPRINTF("cxgb_toe_disconnect\n"); + + so = inp_inpcbtosocket(tp->t_inpcb); + close_conn(so); + return (0); +} + +static int +cxgb_toe_reset(struct tcpcb *tp) +{ + struct toepcb *toep = tp->t_toe; + + t3_send_reset(toep); + + /* + * unhook from socket + */ + tp->t_flags &= ~TF_TOE; + toep->tp_tp = NULL; + tp->t_toe = NULL; + return (0); +} + +static int +cxgb_toe_send(struct tcpcb *tp) +{ + struct socket *so; + + DPRINTF("cxgb_toe_send\n"); + dump_toepcb(tp->t_toe); + + so = inp_inpcbtosocket(tp->t_inpcb); + t3_push_frames(so, 1); + return (0); +} + +static int +cxgb_toe_rcvd(struct tcpcb *tp) +{ + + inp_lock_assert(tp->t_inpcb); + + t3_cleanup_rbuf(tp, 0); + + return (0); +} + +static void +cxgb_toe_detach(struct tcpcb *tp) +{ + struct toepcb *toep; + + /* + * XXX how do we handle teardown in the SYN_SENT state? 
+ * + */ + inp_lock_assert(tp->t_inpcb); + toep = tp->t_toe; + toep->tp_tp = NULL; + + /* + * unhook from socket + */ + tp->t_flags &= ~TF_TOE; + tp->t_toe = NULL; +} + + +static struct toe_usrreqs cxgb_toe_usrreqs = { + .tu_disconnect = cxgb_toe_disconnect, + .tu_reset = cxgb_toe_reset, + .tu_send = cxgb_toe_send, + .tu_rcvd = cxgb_toe_rcvd, + .tu_detach = cxgb_toe_detach, + .tu_detach = cxgb_toe_detach, + .tu_syncache_event = handle_syncache_event, +}; + + +static void +__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, + uint64_t mask, uint64_t val, int no_reply) +{ + struct cpl_set_tcb_field *req; + + CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + toep->tp_tid, word, mask, val); + + req = mtod(m, struct cpl_set_tcb_field *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); + req->reply = V_NO_REPLY(no_reply); + req->cpu_idx = 0; + req->word = htons(word); + req->mask = htobe64(mask); + req->val = htobe64(val); + + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + send_or_defer(toep, m, 0); +} + +static void +t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) +{ + struct mbuf *m; + struct tcpcb *tp = toep->tp_tp; + + if (toep == NULL) + return; + + if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { + printf("not seting field\n"); + return; + } + + m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); + + __set_tcb_field(toep, m, word, mask, val, 1); +} + +/* + * Set one of the t_flags bits in the TCB. + */ +static void +set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) +{ + + t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. + */ +static void +t3_set_nagle(struct toepcb *toep) +{ + struct tcpcb *tp = toep->tp_tp; + + set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. + */ +void +t3_set_keepalive(struct toepcb *toep, int on_off) +{ + + set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); +} + +void +t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) +{ + set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); +} + +void +t3_set_dack_mss(struct toepcb *toep, int on_off) +{ + + set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. + */ +static void +t3_set_tos(struct toepcb *toep) +{ + int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); + + t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), + V_TCB_TOS(tos)); +} + + +/* + * In DDP mode, TP fails to schedule a timer to push RX data to the host when + * DDP is disabled (data is delivered to freelist). [Note that, the peer should + * set the PSH bit in the last segment, which would trigger delivery.] + * We work around the issue by setting a DDP buffer in a partial placed state, + * which guarantees that TP will schedule a timer. 
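Stepping back to the helpers above for a moment: all of the t3_set_* wrappers funnel into t3_set_tcb_field(), which updates the TCB at a given word offset under a 64-bit mask. The workaround defined just below is the most involved use, combining flag bits with an adjacent field shifted up by 32; the common case is a single flag, as in this sketch of what t3_set_keepalive(toep, 1) amounts to:

    /* Illustration only: turn on one bit in the TCB t_flags word. */
    t3_set_tcb_field(toep, W_TCB_T_FLAGS1,
        1ULL << S_TF_KEEPALIVE,     /* mask: only this flag */
        1ULL << S_TF_KEEPALIVE);    /* value: set it        */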
+ */ +#define TP_DDP_TIMER_WORKAROUND_MASK\ + (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ + ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ + V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) +#define TP_DDP_TIMER_WORKAROUND_VAL\ + (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ + ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\ + 32)) + +static void +t3_enable_ddp(struct toepcb *toep, int on) +{ + if (on) { + + t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), + V_TF_DDP_OFF(0)); + } else + t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_MASK, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_VAL); + +} + +void +t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color) +{ + t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), + tag_color); +} + +void +t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, + unsigned int len) +{ + if (buf_idx == 0) + t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) | + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); + else + t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), + V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) | + V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32)); +} + +static int +t3_set_cong_control(struct socket *so, const char *name) +{ +#ifdef CONGESTION_CONTROL_SUPPORTED + int cong_algo; + + for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) + if (!strcmp(name, t3_cong_ops[cong_algo].name)) + break; + + if (cong_algo >= ARRAY_SIZE(t3_cong_ops)) + return -EINVAL; +#endif + return 0; +} + +int +t3_get_tcb(struct toepcb *toep) +{ + struct cpl_get_tcb *req; + struct tcpcb *tp = toep->tp_tp; + struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); + + if (!m) + return (ENOMEM); + + inp_lock_assert(tp->t_inpcb); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + req = mtod(m, struct cpl_get_tcb *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); + req->cpuno = htons(toep->tp_qset); + req->rsvd = 0; + if (tp->t_state == TCPS_SYN_SENT) + mbufq_tail(&toep->out_of_order_queue, m); // defer + else + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); + return 0; +} + +static inline void +so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid) +{ + + toepcb_hold(toep); + + cxgb_insert_tid(d->cdev, d->client, toep, tid); +} + +/** + * find_best_mtu - find the entry in the MTU table closest to an MTU + * @d: TOM state + * @mtu: the target MTU + * + * Returns the index of the value in the MTU table that is closest to but + * does not exceed the target MTU. 
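A small worked example, using a made-up table since the real MTU table is programmed elsewhere: with d->mtus[] = { 1024, 1280, 1460, 1500, 9000 } and nmtus == 5, a target of 1400 walks past 1024 and 1280, stops because 1460 would exceed it, and returns index 1. select_mss() below then subtracts the 40 bytes of IP and TCP header from the chosen entry to arrive at t_maxseg.

    /* Illustration only, with the made-up table above: */
    unsigned int idx = find_best_mtu(d, 1400);   /* == 1, d->mtus[1] == 1280 */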
+ */ +static unsigned int +find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return (i); +} + +static unsigned int +select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) +{ + unsigned int idx; + +#ifdef notyet + struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt; +#endif + if (tp) { + tp->t_maxseg = pmtu - 40; + if (tp->t_maxseg < td->mtus[0] - 40) + tp->t_maxseg = td->mtus[0] - 40; + idx = find_best_mtu(td, tp->t_maxseg + 40); + + tp->t_maxseg = td->mtus[idx] - 40; + } else + idx = find_best_mtu(td, pmtu); + + return (idx); +} + +static inline void +free_atid(struct t3cdev *cdev, unsigned int tid) +{ + struct toepcb *toep = cxgb_free_atid(cdev, tid); + + if (toep) + toepcb_release(toep); +} + +/* + * Release resources held by an offload connection (TID, L2T entry, etc.) + */ +static void +t3_release_offload_resources(struct toepcb *toep) +{ + struct tcpcb *tp = toep->tp_tp; + struct toedev *tdev = toep->tp_toedev; + struct t3cdev *cdev; + struct socket *so; + unsigned int tid = toep->tp_tid; + struct sockbuf *rcv; + + CTR0(KTR_TOM, "t3_release_offload_resources"); + + if (!tdev) + return; + + cdev = TOEP_T3C_DEV(toep); + if (!cdev) + return; + + toep->tp_qset = 0; + t3_release_ddp_resources(toep); + +#ifdef CTRL_SKB_CACHE + kfree_skb(CTRL_SKB_CACHE(tp)); + CTRL_SKB_CACHE(tp) = NULL; +#endif + + if (toep->tp_wr_avail != toep->tp_wr_max) { + purge_wr_queue(toep); + reset_wr_list(toep); + } + + if (toep->tp_l2t) { + l2t_release(L2DATA(cdev), toep->tp_l2t); + toep->tp_l2t = NULL; + } + toep->tp_tp = NULL; + if (tp) { + inp_lock_assert(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + /* + * cancel any offloaded reads + * + */ + sockbuf_lock(rcv); + tp->t_toe = NULL; + tp->t_flags &= ~TF_TOE; + if (toep->tp_ddp_state.user_ddp_pending) { + t3_cancel_ubuf(toep, rcv); + toep->tp_ddp_state.user_ddp_pending = 0; + } + so_sorwakeup_locked(so); + + } + + if (toep->tp_state == TCPS_SYN_SENT) { + free_atid(cdev, tid); +#ifdef notyet + __skb_queue_purge(&tp->out_of_order_queue); +#endif + } else { // we have TID + cxgb_remove_tid(cdev, toep, tid); + toepcb_release(toep); + } +#if 0 + log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state); +#endif +} + +static void +install_offload_ops(struct socket *so) +{ + struct tcpcb *tp = so_sototcpcb(so); + + KASSERT(tp->t_toe != NULL, ("toepcb not set")); + + t3_install_socket_ops(so); + tp->t_flags |= TF_TOE; + tp->t_tu = &cxgb_toe_usrreqs; +} + +/* + * Determine the receive window scaling factor given a target max + * receive window. + */ +static __inline int +select_rcv_wscale(int space) +{ + int wscale = 0; + + if (space > MAX_RCV_WND) + space = MAX_RCV_WND; + + if (tcp_do_rfc1323) + for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; + + return (wscale); +} + +/* + * Determine the receive window size for a socket. + */ +static unsigned long +select_rcv_wnd(struct toedev *dev, struct socket *so) +{ + struct tom_data *d = TOM_DATA(dev); + unsigned int wnd; + unsigned int max_rcv_wnd; + struct sockbuf *rcv; + + rcv = so_sockbuf_rcv(so); + + if (tcp_do_autorcvbuf) + wnd = tcp_autorcvbuf_max; + else + wnd = rcv->sb_hiwat; + + + + /* XXX + * For receive coalescing to work effectively we need a receive window + * that can accomodate a coalesced segment. + */ + if (wnd < MIN_RCV_WND) + wnd = MIN_RCV_WND; + + /* PR 5138 */ + max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 
+ (uint32_t)d->rx_page_size * 23 : + MAX_RCV_WND); + + return min(wnd, max_rcv_wnd); +} + +/* + * Assign offload parameters to some socket fields. This code is used by + * both active and passive opens. + */ +static inline void +init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, + struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev); + struct sockbuf *snd, *rcv; + +#ifdef notyet + SOCK_LOCK_ASSERT(so); +#endif + + snd = so_sockbuf_snd(so); + rcv = so_sockbuf_rcv(so); + + log(LOG_INFO, "initializing offload socket\n"); + /* + * We either need to fix push frames to work with sbcompress + * or we need to add this + */ + snd->sb_flags |= SB_NOCOALESCE; + rcv->sb_flags |= SB_NOCOALESCE; + + tp->t_toe = toep; + toep->tp_tp = tp; + toep->tp_toedev = dev; + + toep->tp_tid = tid; + toep->tp_l2t = e; + toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs); + toep->tp_wr_unacked = 0; + toep->tp_delack_mode = 0; + + toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu); + /* + * XXX broken + * + */ + tp->rcv_wnd = select_rcv_wnd(dev, so); + + toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; + toep->tp_qset_idx = 0; + + reset_wr_list(toep); + DPRINTF("initialization done\n"); +} + +/* + * The next two functions calculate the option 0 value for a socket. + */ +static inline unsigned int +calc_opt0h(struct socket *so, int mtu_idx) +{ + struct tcpcb *tp = so_sototcpcb(so); + int wscale = select_rcv_wscale(tp->rcv_wnd); + + return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) | + V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS | + V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx); +} + +static inline unsigned int +calc_opt0l(struct socket *so, int ulp_mode) +{ + struct tcpcb *tp = so_sototcpcb(so); + unsigned int val; + + val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) | + V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ)); + + DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val); + return (val); +} + +static inline unsigned int +calc_opt2(const struct socket *so, struct toedev *dev) +{ + int flv_valid; + + flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); + + return (V_FLAVORS_VALID(flv_valid) | + V_CONG_CONTROL_FLAVOR(flv_valid ? 
TOM_TUNABLE(dev, cong_alg) : 0)); +} + +#if DEBUG_WR > 1 +static int +count_pending_wrs(const struct toepcb *toep) +{ + const struct mbuf *m; + int n = 0; + + wr_queue_walk(toep, m) + n += m->m_pkthdr.csum_data; + return (n); +} +#endif + +#if 0 +(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) +#endif + +static void +mk_act_open_req(struct socket *so, struct mbuf *m, + unsigned int atid, const struct l2t_entry *e) +{ + struct cpl_act_open_req *req; + struct inpcb *inp = so_sotoinpcb(so); + struct tcpcb *tp = inp_inpcbtotcpcb(inp); + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = toep->tp_toedev; + + m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep)); + + req = mtod(m, struct cpl_act_open_req *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); + inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port); +#if 0 + req->local_port = inp->inp_lport; + req->peer_port = inp->inp_fport; + memcpy(&req->local_ip, &inp->inp_laddr, 4); + memcpy(&req->peer_ip, &inp->inp_faddr, 4); +#endif + req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | + V_TX_CHANNEL(e->smt_idx)); + req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); + req->params = 0; + req->opt2 = htonl(calc_opt2(so, tdev)); +} + + +/* + * Convert an ACT_OPEN_RPL status to an errno. + */ +static int +act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return (ECONNREFUSED); + case CPL_ERR_ARP_MISS: + return (EHOSTUNREACH); + case CPL_ERR_CONN_TIMEDOUT: + return (ETIMEDOUT); + case CPL_ERR_TCAM_FULL: + return (ENOMEM); + case CPL_ERR_CONN_EXIST: + log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); + return (EADDRINUSE); + default: + return (EIO); + } +} + +static void +fail_act_open(struct toepcb *toep, int errno) +{ + struct tcpcb *tp = toep->tp_tp; + + t3_release_offload_resources(toep); + if (tp) { + inp_wunlock(tp->t_inpcb); + tcp_offload_drop(tp, errno); + } + +#ifdef notyet + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); +#endif +} + +/* + * Handle active open failures. + */ +static void +active_open_failed(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_act_open_rpl *rpl = cplhdr(m); + struct inpcb *inp; + + if (toep->tp_tp == NULL) + goto done; + + inp = toep->tp_tp->t_inpcb; + +/* + * Don't handle connection retry for now + */ +#ifdef notyet + struct inet_connection_sock *icsk = inet_csk(sk); + + if (rpl->status == CPL_ERR_CONN_EXIST && + icsk->icsk_retransmit_timer.function != act_open_retry_timer) { + icsk->icsk_retransmit_timer.function = act_open_retry_timer; + sk_reset_timer(so, &icsk->icsk_retransmit_timer, + jiffies + HZ / 2); + } else +#endif + { + inp_wlock(inp); + /* + * drops the inpcb lock + */ + fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status)); + } + + done: + m_free(m); +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int +act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +/* + * Process an ACT_OPEN_RPL CPL message. 
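+ * As handled below an ACT_OPEN_RPL is treated as a failed active open: on
+ * non-T3A parts any TID the failed open may still hold is queued for
+ * release, and the error is then propagated to the inpcb through
+ * active_open_failed().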
+ */
+static int
+do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+	struct toepcb *toep = (struct toepcb *)ctx;
+	struct cpl_act_open_rpl *rpl = cplhdr(m);
+
+	if (cdev->type != T3A && act_open_has_tid(rpl->status))
+		cxgb_queue_tid_release(cdev, GET_TID(rpl));
+
+	active_open_failed(toep, m);
+	return (0);
+}
+
+/*
+ * Handle an ARP failure for an active open.  XXX purge ofo queue
+ *
+ * XXX badly broken for crossed SYNs as the ATID is no longer valid.
+ * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
+ * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
+ * free the atid.  Hmm.
+ */
+#ifdef notyet
+static void
+act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
+{
+	struct toepcb *toep = m_get_toep(m);
+	struct tcpcb *tp = toep->tp_tp;
+	struct inpcb *inp = tp->t_inpcb;
+
+	inp_wlock(inp);
+	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
+		/*
+		 * drops the inpcb lock
+		 */
+		fail_act_open(toep, EHOSTUNREACH);
+		printf("freeing %p\n", m);
+
+		m_free(m);
+	} else
+		inp_wunlock(inp);
+}
+#endif
+/*
+ * Send an active open request.
+ */
+int
+t3_connect(struct toedev *tdev, struct socket *so,
+    struct rtentry *rt, struct sockaddr *nam)
+{
+	struct mbuf *m;
+	struct l2t_entry *e;
+	struct tom_data *d = TOM_DATA(tdev);
+	struct inpcb *inp = so_sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toepcb *toep; /* allocated by toepcb_alloc() below */
+
+	int atid;
+
+	toep = toepcb_alloc();
+	if (toep == NULL)
+		goto out_err;
+
+	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
+		goto out_err;
+
+	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
+	if (!e)
+		goto free_tid;
+
+	inp_lock_assert(inp);
+	m = m_gethdr(M_WAITOK, MT_DATA);
+
+#if 0
+	m->m_toe.mt_toepcb = tp->t_toe;
+	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
+#endif
+	so_lock(so);
+
+	init_offload_socket(so, tdev, atid, e, rt, toep);
+
+	install_offload_ops(so);
+
+	mk_act_open_req(so, m, atid, e);
+	so_unlock(so);
+
+	soisconnecting(so);
+	toep = tp->t_toe;
+	m_set_toep(m, tp->t_toe);
+
+	toep->tp_state = TCPS_SYN_SENT;
+	l2t_send(d->cdev, (struct mbuf *)m, e);
+
+	if (toep->tp_ulp_mode)
+		t3_enable_ddp(toep, 0);
+	return (0);
+
+free_tid:
+	printf("failing connect - free atid\n");
+
+	free_atid(d->cdev, atid);
+out_err:
+	printf("return ENOMEM\n");
+	return (ENOMEM);
+}
+
+/*
+ * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
+ * not send multiple ABORT_REQs for the same connection and also that we do
+ * not try to send a message after the connection has closed.  It simply
+ * returns if an ABORT_REQ should not be generated after all.
+ */
+static void
+t3_send_reset(struct toepcb *toep)
+{
+
+	struct cpl_abort_req *req;
+	unsigned int tid = toep->tp_tid;
+	int mode = CPL_ABORT_SEND_RST;
+	struct tcpcb *tp = toep->tp_tp;
+	struct toedev *tdev = toep->tp_toedev;
+	struct socket *so = NULL;
+	struct mbuf *m;
+	struct sockbuf *snd;
+
+	if (tp) {
+		inp_lock_assert(tp->t_inpcb);
+		so = inp_inpcbtosocket(tp->t_inpcb);
+	}
+
+	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
+		tdev == NULL))
+		return;
+	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
+
+	snd = so_sockbuf_snd(so);
+	/* Purge the send queue so we don't send anything after an abort.
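+	 * Any bytes still queued in the socket's send buffer would otherwise
+	 * be pushed out behind the ABORT_REQ that is built below.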
*/ + if (so) + sbflush(snd); + if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) + mode |= CPL_ABORT_POST_CLOSE_REQ; + + m = m_gethdr_nofail(sizeof(*req)); + m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); + set_arp_failure_handler(m, abort_arp_failure); + + req = mtod(m, struct cpl_abort_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); + req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0; + req->rsvd1 = !(toep->tp_flags & TP_DATASENT); + req->cmd = mode; + if (tp && (tp->t_state == TCPS_SYN_SENT)) + mbufq_tail(&toep->out_of_order_queue, m); // defer + else + l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); +} + +static int +t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) +{ + struct inpcb *inp; + int error, optval; + + if (sopt->sopt_name == IP_OPTIONS) + return (ENOPROTOOPT); + + if (sopt->sopt_name != IP_TOS) + return (EOPNOTSUPP); + + error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + + if (error) + return (error); + + if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread)) + return (EPERM); + + inp = so_sotoinpcb(so); + inp_wlock(inp); + inp_ip_tos_set(inp, optval); +#if 0 + inp->inp_ip_tos = optval; +#endif + t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); + inp_wunlock(inp); + + return (0); +} + +static int +t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int err = 0; + size_t copied; + + if (sopt->sopt_name != TCP_CONGESTION && + sopt->sopt_name != TCP_NODELAY) + return (EOPNOTSUPP); + + if (sopt->sopt_name == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + int optlen = sopt->sopt_valsize; + struct tcpcb *tp; + + if (sopt->sopt_dir == SOPT_GET) { + KASSERT(0, ("unimplemented")); + return (EOPNOTSUPP); + } + + if (optlen < 1) + return (EINVAL); + + err = copyinstr(sopt->sopt_val, name, + min(TCP_CA_NAME_MAX - 1, optlen), &copied); + if (err) + return (err); + if (copied < 1) + return (EINVAL); + + tp = so_sototcpcb(so); + /* + * XXX I need to revisit this + */ + if ((err = t3_set_cong_control(so, name)) == 0) { +#ifdef CONGESTION_CONTROL_SUPPORTED + tp->t_cong_control = strdup(name, M_CXGB); +#endif + } else + return (err); + } else { + int optval, oldval; + struct inpcb *inp; + struct tcpcb *tp; + + if (sopt->sopt_dir == SOPT_GET) + return (EOPNOTSUPP); + + err = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + + if (err) + return (err); + + inp = so_sotoinpcb(so); + tp = inp_inpcbtotcpcb(inp); + + inp_wlock(inp); + + oldval = tp->t_flags; + if (optval) + tp->t_flags |= TF_NODELAY; + else + tp->t_flags &= ~TF_NODELAY; + inp_wunlock(inp); + + + if (oldval != tp->t_flags && (tp->t_toe != NULL)) + t3_set_nagle(tp->t_toe); + + } + + return (0); +} + +int +t3_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int err; + + if (sopt->sopt_level != IPPROTO_TCP) + err = t3_ip_ctloutput(so, sopt); + else + err = t3_tcp_ctloutput(so, sopt); + + if (err != EOPNOTSUPP) + return (err); + + return (tcp_ctloutput(so, sopt)); +} + +/* + * Returns true if we need to explicitly request RST when we receive new data + * on an RX-closed connection. + */ +static inline int +need_rst_on_excess_rx(const struct toepcb *toep) +{ + return (1); +} + +/* + * Handles Rx data that arrives in a state where the socket isn't accepting + * new data. 
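+ * The mbuf is dropped; if need_rst_on_excess_rx() says so and no abort is
+ * already in progress the connection is also reset.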
+ */ +static void +handle_excess_rx(struct toepcb *toep, struct mbuf *m) +{ + + if (need_rst_on_excess_rx(toep) && + !(toep->tp_flags & TP_ABORT_SHUTDOWN)) + t3_send_reset(toep); + m_freem(m); +} + +/* + * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) + * by getting the DDP offset from the TCB. + */ +static void +tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m) +{ + struct ddp_state *q = &toep->tp_ddp_state; + struct ddp_buf_state *bsp; + struct cpl_get_tcb_rpl *hdr; + unsigned int ddp_offset; + struct socket *so; + struct tcpcb *tp; + struct sockbuf *rcv; + int state; + + uint64_t t; + __be64 *tcb; + + tp = toep->tp_tp; + so = inp_inpcbtosocket(tp->t_inpcb); + + inp_lock_assert(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + /* Note that we only accout for CPL_GET_TCB issued by the DDP code. + * We really need a cookie in order to dispatch the RPLs. + */ + q->get_tcb_count--; + + /* It is a possible that a previous CPL already invalidated UBUF DDP + * and moved the cur_buf idx and hence no further processing of this + * skb is required. However, the app might be sleeping on + * !q->get_tcb_count and we need to wake it up. + */ + if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) { + int state = so_state_get(so); + + m_freem(m); + if (__predict_true((state & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); + + return; + } + + bsp = &q->buf_state[q->cur_buf]; + hdr = cplhdr(m); + tcb = (__be64 *)(hdr + 1); + if (q->cur_buf == 0) { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]); + ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET); + } else { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]); + ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET; + } + ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET; + m->m_cur_offset = bsp->cur_offset; + bsp->cur_offset = ddp_offset; + m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset; + + CTR5(KTR_TOM, + "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u", + q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset); + KASSERT(ddp_offset >= m->m_cur_offset, + ("ddp_offset=%u less than cur_offset=%u", + ddp_offset, m->m_cur_offset)); + +#if 0 +{ + unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx; + + t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]); + ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS; + + t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]); + rcv_nxt = t >> S_TCB_RCV_NXT; + rcv_nxt &= M_TCB_RCV_NXT; + + t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]); + rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET); + rx_hdr_offset &= M_TCB_RX_HDR_OFFSET; + + T3_TRACE2(TIDTB(sk), + "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x", + ddp_flags, rcv_nxt - rx_hdr_offset); + T3_TRACE4(TB(q), + "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u", + tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf); + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u", + rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset); + T3_TRACE2(TB(q), + "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x", + q->buf_state[0].flags, q->buf_state[1].flags); + +} +#endif + if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) { + handle_excess_rx(toep, m); + return; + } + +#ifdef T3_TRACE + if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len"); + } +#endif + if (bsp->flags & DDP_BF_NOCOPY) { +#ifdef T3_TRACE + T3_TRACE0(TB(q), + "tcb_rpl_as_ddp_complete: CANCEL 
UBUF"); + + if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + printk("!cancel_ubuf"); + t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf"); + } +#endif + m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1; + bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); + q->cur_buf ^= 1; + } else if (bsp->flags & DDP_BF_NOFLIP) { + + m->m_ddp_flags = 1; /* always a kernel buffer */ + + /* now HW buffer carries a user buffer */ + bsp->flags &= ~DDP_BF_NOFLIP; + bsp->flags |= DDP_BF_NOCOPY; + + /* It is possible that the CPL_GET_TCB_RPL doesn't indicate + * any new data in which case we're done. If in addition the + * offset is 0, then there wasn't a completion for the kbuf + * and we need to decrement the posted count. + */ + if (m->m_pkthdr.len == 0) { + if (ddp_offset == 0) { + q->kbuf_posted--; + bsp->flags |= DDP_BF_NODATA; + } + sockbuf_unlock(rcv); + m_free(m); + return; + } + } else { + sockbuf_unlock(rcv); + + /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, + * but it got here way late and nobody cares anymore. + */ + m_free(m); + return; + } + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; + CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u", + m->m_seq, q->cur_buf, m->m_pkthdr.len); + if (m->m_pkthdr.len == 0) { + q->user_ddp_pending = 0; + m_free(m); + } else + SBAPPEND(rcv, m); + + state = so_state_get(so); + if (__predict_true((state & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +/* + * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, + * in that case they are similar to DDP completions. + */ +static int +do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + /* OK if socket doesn't exist */ + if (toep == NULL) { + printf("null toep in do_get_tcb_rpl\n"); + return (CPL_RET_BUF_DONE); + } + + inp_wlock(toep->tp_tp->t_inpcb); + tcb_rpl_as_ddp_complete(toep, m); + inp_wunlock(toep->tp_tp->t_inpcb); + + return (0); +} + +static void +handle_ddp_data(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data *hdr = cplhdr(m); + unsigned int rcv_nxt = ntohl(hdr->seq); + struct sockbuf *rcv; + + if (tp->rcv_nxt == rcv_nxt) + return; + + inp_lock_assert(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + q = &toep->tp_ddp_state; + bsp = &q->buf_state[q->cur_buf]; + KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", + rcv_nxt, tp->rcv_nxt)); + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", + rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); + +#ifdef T3_TRACE + if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "handle_ddp_data: neg len"); + } +#endif + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; + /* + * For now, don't re-enable DDP after a connection fell out of DDP + * mode. 
+ */ + q->ubuf_ddp_ready = 0; + sockbuf_unlock(rcv); +} + +/* + * Process new data received for a connection. + */ +static void +new_rx_data(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_rx_data *hdr = cplhdr(m); + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct sockbuf *rcv; + int state; + int len = be16toh(hdr->len); + + inp_wlock(tp->t_inpcb); + + so = inp_inpcbtosocket(tp->t_inpcb); + + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + inp_wunlock(tp->t_inpcb); + TRACE_EXIT; + return; + } + + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) + handle_ddp_data(toep, m); + + m->m_seq = ntohl(hdr->seq); + m->m_ulp_mode = 0; /* for iSCSI */ + +#if VALIDATE_SEQ + if (__predict_false(m->m_seq != tp->rcv_nxt)) { + log(LOG_ERR, + "%s: TID %u: Bad sequence number %u, expected %u\n", + toep->tp_toedev->name, toep->tp_tid, m->m_seq, + tp->rcv_nxt); + m_freem(m); + inp_wunlock(tp->t_inpcb); + return; + } +#endif + m_adj(m, sizeof(*hdr)); + +#ifdef URGENT_DATA_SUPPORTED + /* + * We don't handle urgent data yet + */ + if (__predict_false(hdr->urg)) + handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); + if (__predict_false(tp->urg_data == TCP_URG_NOTYET && + tp->urg_seq - tp->rcv_nxt < skb->len)) + tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - + tp->rcv_nxt]; +#endif + if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { + toep->tp_delack_mode = hdr->dack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } + CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", + m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); + + if (len < m->m_pkthdr.len) + m->m_pkthdr.len = m->m_len = len; + + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; + toep->tp_enqueued_bytes += m->m_pkthdr.len; + CTR2(KTR_TOM, + "new_rx_data: seq 0x%x len %u", + m->m_seq, m->m_pkthdr.len); + inp_wunlock(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); +#if 0 + if (sb_notify(rcv)) + DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len); +#endif + SBAPPEND(rcv, m); + +#ifdef notyet + /* + * We're giving too many credits to the card - but disable this check so we can keep on moving :-| + * + */ + KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1), + + ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", + so, rcv->sb_cc, rcv->sb_mbmax)); +#endif + + + CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", + rcv->sb_cc, rcv->sb_mbcnt); + + state = so_state_get(so); + if (__predict_true((state & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +/* + * Handler for RX_DATA CPL messages. 
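+ * Thin dispatch wrapper; the actual payload handling is in new_rx_data()
+ * above.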
+ */ +static int +do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); + + new_rx_data(toep, m); + + return (0); +} + +static void +new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data_ddp *hdr; + struct socket *so; + unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; + int nomoredata = 0; + unsigned int delack_mode; + struct sockbuf *rcv; + + tp = toep->tp_tp; + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + if (__predict_false(so_no_receive(so))) { + + handle_excess_rx(toep, m); + inp_wunlock(tp->t_inpcb); + return; + } + + q = &toep->tp_ddp_state; + hdr = cplhdr(m); + ddp_report = ntohl(hdr->u.ddp_report); + buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; + bsp = &q->buf_state[buf_idx]; + + CTR4(KTR_TOM, + "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " + "hdr seq 0x%x len %u", + tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), + ntohs(hdr->len)); + CTR3(KTR_TOM, + "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", + G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); + + ddp_len = ntohs(hdr->len); + rcv_nxt = ntohl(hdr->seq) + ddp_len; + + delack_mode = G_DDP_DACK_MODE(ddp_report); + if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { + toep->tp_delack_mode = delack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } + + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + + tp->t_rcvtime = ticks; + /* + * Store the length in m->m_len. We are changing the meaning of + * m->m_len here, we need to be very careful that nothing from now on + * interprets ->len of this packet the usual way. + */ + m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; + inp_wunlock(tp->t_inpcb); + CTR3(KTR_TOM, + "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", + m->m_len, rcv_nxt, m->m_seq); + /* + * Figure out where the new data was placed in the buffer and store it + * in when. Assumes the buffer offset starts at 0, consumer needs to + * account for page pod's pg_offset. + */ + end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; + m->m_cur_offset = end_offset - m->m_pkthdr.len; + + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + bsp->cur_offset = end_offset; + toep->tp_enqueued_bytes += m->m_pkthdr.len; + + /* + * Length is only meaningful for kbuf + */ + if (!(bsp->flags & DDP_BF_NOCOPY)) + KASSERT(m->m_len <= bsp->gl->dgl_length, + ("length received exceeds ddp pages: len=%d dgl_length=%d", + m->m_len, bsp->gl->dgl_length)); + + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); + /* + * Bit 0 of flags stores whether the DDP buffer is completed. + * Note that other parts of the code depend on this being in bit 0. 
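+	 * That is why the assignment below stores
+	 * !!(ddp_report & F_DDP_BUF_COMPLETE) straight into m_ddp_flags
+	 * instead of using a named flag constant.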
+ */ + if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { + panic("spurious ddp completion"); + } else { + m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); + if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; /* flip buffers */ + } + + if (bsp->flags & DDP_BF_NOCOPY) { + m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); + bsp->flags &= ~DDP_BF_NOCOPY; + } + + if (ddp_report & F_DDP_PSH) + m->m_ddp_flags |= DDP_BF_PSH; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; + +#ifdef notyet + skb_reset_transport_header(skb); + tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ +#endif + SBAPPEND(rcv, m); + + if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || + (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) + || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ + F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ + F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ + F_DDP_INVALID_PPOD) + +/* + * Handler for RX_DATA_DDP CPL messages. + */ +static int +do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = ctx; + const struct cpl_rx_data_ddp *hdr = cplhdr(m); + + VALIDATE_SOCK(so); + + if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { + log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", + GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); + return (CPL_RET_BUF_DONE); + } +#if 0 + skb->h.th = tcphdr_skb->h.th; +#endif + new_rx_data_ddp(toep, m); + return (0); +} + +static void +process_ddp_complete(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_ddp_complete *hdr; + unsigned int ddp_report, buf_idx, when, delack_mode; + int nomoredata = 0; + struct sockbuf *rcv; + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + if (__predict_false(so_no_receive(so))) { + struct inpcb *inp = so_sotoinpcb(so); + + handle_excess_rx(toep, m); + inp_wunlock(inp); + return; + } + q = &toep->tp_ddp_state; + hdr = cplhdr(m); + ddp_report = ntohl(hdr->ddp_report); + buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; + m->m_pkthdr.csum_data = tp->rcv_nxt; + + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + bsp = &q->buf_state[buf_idx]; + when = bsp->cur_offset; + m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when; + tp->rcv_nxt += m->m_len; + tp->t_rcvtime = ticks; + + delack_mode = G_DDP_DACK_MODE(ddp_report); + if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { + toep->tp_delack_mode = delack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } +#ifdef notyet + skb_reset_transport_header(skb); + tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ +#endif + inp_wunlock(tp->t_inpcb); + + KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + CTR5(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report 0x%x offset %u, len %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report), m->m_len); + + m->m_cur_offset = bsp->cur_offset; + bsp->cur_offset += m->m_len; + + if (!(bsp->flags & DDP_BF_NOFLIP)) { + q->cur_buf ^= 1; /* flip buffers */ + if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length) + nomoredata=1; + } + + CTR4(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report %u 
offset %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report)); + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; + + SBAPPEND(rcv, m); + if ((so_state_get(so) & SS_NOFDREF) == 0) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +/* + * Handler for RX_DDP_COMPLETE CPL messages. + */ +static int +do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = ctx; + + VALIDATE_SOCK(so); +#if 0 + skb->h.th = tcphdr_skb->h.th; +#endif + process_ddp_complete(toep, m); + return (0); +} + +/* + * Move a socket to TIME_WAIT state. We need to make some adjustments to the + * socket state before calling tcp_time_wait to comply with its expectations. + */ +static void +enter_timewait(struct tcpcb *tp) +{ + /* + * Bump rcv_nxt for the peer FIN. We don't do this at the time we + * process peer_close because we don't want to carry the peer FIN in + * the socket's receive queue and if we increment rcv_nxt without + * having the FIN in the receive queue we'll confuse facilities such + * as SIOCINQ. + */ + inp_wlock(tp->t_inpcb); + tp->rcv_nxt++; + + tp->ts_recent_age = 0; /* defeat recycling */ + tp->t_srtt = 0; /* defeat tcp_update_metrics */ + inp_wunlock(tp->t_inpcb); + tcp_offload_twstart(tp); +} + +/* + * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This + * function deals with the data that may be reported along with the FIN. + * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to + * perform normal FIN-related processing. In the latter case 1 indicates that + * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the + * skb can be freed. + */ +static int +handle_peer_close_data(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_peer_close *req = cplhdr(m); + unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ + struct sockbuf *rcv; + + if (tp->rcv_nxt == rcv_nxt) /* no data */ + return (0); + + CTR0(KTR_TOM, "handle_peer_close_data"); + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + + /* + * Although we discard the data we want to process the FIN so + * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + + * PEER_CLOSE without data. In particular this PEER_CLOSE + * may be what will close the connection. We return 1 because + * handle_excess_rx() already freed the packet. 
+ */ + return (1); + } + + inp_lock_assert(tp->t_inpcb); + q = &toep->tp_ddp_state; + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + bsp = &q->buf_state[q->cur_buf]; + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = + DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; +#ifdef notyet + skb_reset_transport_header(skb); + tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ +#endif + tp->t_rcvtime = ticks; + SBAPPEND(rcv, m); + if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); + + return (1); +} + +/* + * Handle a peer FIN. + */ +static void +do_peer_fin(struct toepcb *toep, struct mbuf *m) +{ + struct socket *so; + struct tcpcb *tp = toep->tp_tp; + int keep, action; + + action = keep = 0; + CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state); + if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { + printf("abort_pending set\n"); + + goto out; + } + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { + keep = handle_peer_close_data(so, m); + if (keep < 0) { + inp_wunlock(tp->t_inpcb); + return; + } + } + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + CTR1(KTR_TOM, + "waking up waiters for cantrcvmore on %p ", so); + socantrcvmore(so); + + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tp->t_flags & TF_NEEDSYN) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + case TCPS_FIN_WAIT_2: + /* + * If we've sent an abort_req we must have sent it too late, + * HW will send us a reply telling us so, and this peer_close + * is really the last message for this connection and needs to + * be treated as an abort_rpl, i.e., transition the connection + * to TCP_CLOSE (note that the host stack does this at the + * time of generating the RST but we must wait for HW). + * Otherwise we enter TIME_WAIT. + */ + t3_release_offload_resources(toep); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + action = TCP_CLOSE; + } else { + action = TCP_TIMEWAIT; + } + break; + default: + log(LOG_ERR, + "%s: TID %u received PEER_CLOSE in bad state %d\n", + toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); + } + inp_wunlock(tp->t_inpcb); + + if (action == TCP_TIMEWAIT) { + enter_timewait(tp); + } else if (action == TCP_DROP) { + tcp_offload_drop(tp, 0); + } else if (action == TCP_CLOSE) { + tcp_offload_close(tp); + } + +#ifdef notyet + /* Do not send POLL_HUP for half duplex close. */ + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(so, 1, POLL_HUP); + else + sk_wake_async(so, 1, POLL_IN); +#endif + +out: + if (!keep) + m_free(m); +} + +/* + * Handler for PEER_CLOSE CPL messages. 
+ */ +static int +do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + VALIDATE_SOCK(so); + + do_peer_fin(toep, m); + return (0); +} + +static void +process_close_con_rpl(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_close_con_rpl *rpl = cplhdr(m); + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + int action = 0; + struct sockbuf *rcv; + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ + + if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { + inp_wunlock(tp->t_inpcb); + goto out; + } + + CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, + tp->t_state, !!(so_state_get(so) & SS_NOFDREF)); + + switch (tp->t_state) { + case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ + t3_release_offload_resources(toep); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + action = TCP_CLOSE; + + } else { + action = TCP_TIMEWAIT; + } + break; + case TCPS_LAST_ACK: + /* + * In this state we don't care about pending abort_rpl. + * If we've sent abort_req it was post-close and was sent too + * late, this close_con_rpl is the actual last message. + */ + t3_release_offload_resources(toep); + action = TCP_CLOSE; + break; + case TCPS_FIN_WAIT_1: + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: + * we should release the tp also, and use a + * compressed state. + */ + if (so) + rcv = so_sockbuf_rcv(so); + else + break; + + if (rcv->sb_state & SBS_CANTRCVMORE) { + int timeout; + + if (so) + soisdisconnected(so); + timeout = (tcp_fast_finwait2_recycle) ? + tcp_finwait2_timeout : tcp_maxidle; + tcp_timer_activate(tp, TT_2MSL, timeout); + } + tp->t_state = TCPS_FIN_WAIT_2; + if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && + (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { + action = TCP_DROP; + } + + break; + default: + log(LOG_ERR, + "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", + toep->tp_toedev->tod_name, toep->tp_tid, + tp->t_state); + } + inp_wunlock(tp->t_inpcb); + + + if (action == TCP_TIMEWAIT) { + enter_timewait(tp); + } else if (action == TCP_DROP) { + tcp_offload_drop(tp, 0); + } else if (action == TCP_CLOSE) { + tcp_offload_close(tp); + } +out: + m_freem(m); +} + +/* + * Handler for CLOSE_CON_RPL CPL messages. + */ +static int +do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, + void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + process_close_con_rpl(toep, m); + return (0); +} + +/* + * Process abort replies. We only process these messages if we anticipate + * them as the coordination between SW and HW in this area is somewhat lacking + * and sometimes we get ABORT_RPLs after we are done with the connection that + * originated the ABORT_REQ. 
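+ * Concretely, a reply that arrives while TP_ABORT_RPL_PENDING is not set on
+ * the toepcb is simply freed without touching the connection state.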
+ */ +static void +process_abort_rpl(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + int needclose = 0; + +#ifdef T3_TRACE + T3_TRACE1(TIDTB(sk), + "process_abort_rpl: GTS rpl pending %d", + sock_flag(sk, ABORT_RPL_PENDING)); +#endif + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + /* + * XXX panic on tcpdrop + */ + if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) + toep->tp_flags |= TP_ABORT_RPL_RCVD; + else { + toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); + if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || + !is_t3a(toep->tp_toedev)) { + if (toep->tp_flags & TP_ABORT_REQ_RCVD) + panic("TP_ABORT_REQ_RCVD set"); + t3_release_offload_resources(toep); + needclose = 1; + } + } + } + inp_wunlock(tp->t_inpcb); + + if (needclose) + tcp_offload_close(tp); + + m_free(m); +} + +/* + * Handle an ABORT_RPL_RSS CPL message. + */ +static int +do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_abort_rpl_rss *rpl = cplhdr(m); + struct toepcb *toep; + + /* + * Ignore replies to post-close aborts indicating that the abort was + * requested too late. These connections are terminated when we get + * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss + * arrives the TID is either no longer used or it has been recycled. + */ + if (rpl->status == CPL_ERR_ABORT_FAILED) { +discard: + m_free(m); + return (0); + } + + toep = (struct toepcb *)ctx; + + /* + * Sometimes we've already closed the socket, e.g., a post-close + * abort races with ABORT_REQ_RSS, the latter frees the socket + * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, + * but FW turns the ABORT_REQ into a regular one and so we get + * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. + */ + if (!toep) + goto discard; + + if (toep->tp_tp == NULL) { + log(LOG_NOTICE, "removing tid for abort\n"); + cxgb_remove_tid(cdev, toep, toep->tp_tid); + if (toep->tp_l2t) + l2t_release(L2DATA(cdev), toep->tp_l2t); + + toepcb_release(toep); + goto discard; + } + + log(LOG_NOTICE, "toep=%p\n", toep); + log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); + + toepcb_hold(toep); + process_abort_rpl(toep, m); + toepcb_release(toep); + return (0); +} + +/* + * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also + * indicate whether RST should be sent in response. + */ +static int +abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) +{ + struct tcpcb *tp = so_sototcpcb(so); + + switch (abort_reason) { + case CPL_ERR_BAD_SYN: +#if 0 + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through +#endif + case CPL_ERR_CONN_RESET: + // XXX need to handle SYN_RECV due to crossed SYNs + return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: +#if 0 + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); +#endif + return (ETIMEDOUT); + default: + return (EIO); + } +} + +static inline void +set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) +{ + struct cpl_abort_rpl *rpl = cplhdr(m); + + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(tid)); + m->m_len = m->m_pkthdr.len = sizeof(*rpl); + + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + rpl->cmd = cmd; +} + +static void +send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) +{ + struct mbuf *reply_mbuf; + struct cpl_abort_req_rss *req = cplhdr(m); + + reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); + m_set_priority(m, CPL_PRIORITY_DATA); + m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); + set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); + m_free(m); +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static inline int +is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static void +send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) +{ + struct mbuf *reply_mbuf; + struct cpl_abort_req_rss *req = cplhdr(m); + + reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + + if (!reply_mbuf) { + /* Defer the reply. Stick rst_status into req->cmd. */ + req->status = rst_status; + t3_defer_reply(m, tdev, send_deferred_abort_rpl); + return; + } + + m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); + set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); + m_free(m); + + /* + * XXX need to sync with ARP as for SYN_RECV connections we can send + * these messages while ARP is pending. For other connection states + * it's not a problem. + */ + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); +} + +#ifdef notyet +static void +cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) +{ + CXGB_UNIMPLEMENTED(); +#ifdef notyet + struct request_sock *req = child->sk_user_data; + + inet_csk_reqsk_queue_removed(parent, req); + synq_remove(tcp_sk(child)); + __reqsk_free(req); + child->sk_user_data = NULL; +#endif +} + + +/* + * Performs the actual work to abort a SYN_RECV connection. + */ +static void +do_abort_syn_rcv(struct socket *child, struct socket *parent) +{ + struct tcpcb *parenttp = so_sototcpcb(parent); + struct tcpcb *childtp = so_sototcpcb(child); + + /* + * If the server is still open we clean up the child connection, + * otherwise the server already did the clean up as it was purging + * its SYN queue and the skb was just sitting in its backlog. + */ + if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { + cleanup_syn_rcv_conn(child, parent); + inp_wlock(childtp->t_inpcb); + t3_release_offload_resources(childtp->t_toe); + inp_wunlock(childtp->t_inpcb); + tcp_offload_close(childtp); + } +} +#endif + +/* + * Handle abort requests for a SYN_RECV connection. These need extra work + * because the socket is on its parent's SYN queue. 
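+ * In this port the SYN_RECV abort path is still a stub (see
+ * CXGB_UNIMPLEMENTED below); embryonic connections are tracked on the
+ * listen context's synq_head list instead.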
+ */ +static int +abort_syn_rcv(struct socket *so, struct mbuf *m) +{ + CXGB_UNIMPLEMENTED(); +#ifdef notyet + struct socket *parent; + struct toedev *tdev = toep->tp_toedev; + struct t3cdev *cdev = TOM_DATA(tdev)->cdev; + struct socket *oreq = so->so_incomp; + struct t3c_tid_entry *t3c_stid; + struct tid_info *t; + + if (!oreq) + return -1; /* somehow we are not on the SYN queue */ + + t = &(T3C_DATA(cdev))->tid_maps; + t3c_stid = lookup_stid(t, oreq->ts_recent); + parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; + + so_lock(parent); + do_abort_syn_rcv(so, parent); + send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); + so_unlock(parent); +#endif + return (0); +} + +/* + * Process abort requests. If we are waiting for an ABORT_RPL we ignore this + * request except that we need to reply to it. + */ +static void +process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) +{ + int rst_status = CPL_ABORT_NO_RST; + const struct cpl_abort_req_rss *req = cplhdr(m); + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + int needclose = 0; + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); + if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { + toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); + m_free(m); + goto skip; + } + + toep->tp_flags &= ~TP_ABORT_REQ_RCVD; + /* + * Three cases to consider: + * a) We haven't sent an abort_req; close the connection. + * b) We have sent a post-close abort_req that will get to TP too late + * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will + * be ignored and the connection should be closed now. + * c) We have sent a regular abort_req that will get to TP too late. + * That will generate an abort_rpl with status 0, wait for it. + */ + if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || + (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { + int error; + + error = abort_status_to_errno(so, req->status, + &rst_status); + so_error_set(so, error); + + if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) + so_sorwakeup(so); + /* + * SYN_RECV needs special processing. If abort_syn_rcv() + * returns 0 is has taken care of the abort. + */ + if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) + goto skip; + + t3_release_offload_resources(toep); + needclose = 1; + } + inp_wunlock(tp->t_inpcb); + + if (needclose) + tcp_offload_close(tp); + + send_abort_rpl(m, tdev, rst_status); + return; +skip: + inp_wunlock(tp->t_inpcb); +} + +/* + * Handle an ABORT_REQ_RSS CPL message. 
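+ * Negative-advice aborts are dropped outright; a connection still in
+ * SYN_RCVD is torn down directly here, and everything else goes through
+ * process_abort_req() above.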
+ */ +static int +do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + const struct cpl_abort_req_rss *req = cplhdr(m); + struct toepcb *toep = (struct toepcb *)ctx; + + if (is_neg_adv_abort(req->status)) { + m_free(m); + return (0); + } + + log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); + + if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { + cxgb_remove_tid(cdev, toep, toep->tp_tid); + toep->tp_flags |= TP_ABORT_REQ_RCVD; + + send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); + if (toep->tp_l2t) + l2t_release(L2DATA(cdev), toep->tp_l2t); + + /* + * Unhook + */ + toep->tp_tp->t_toe = NULL; + toep->tp_tp->t_flags &= ~TF_TOE; + toep->tp_tp = NULL; + /* + * XXX need to call syncache_chkrst - but we don't + * have a way of doing that yet + */ + toepcb_release(toep); + log(LOG_ERR, "abort for unestablished connection :-(\n"); + return (0); + } + if (toep->tp_tp == NULL) { + log(LOG_NOTICE, "disconnected toepcb\n"); + /* should be freed momentarily */ + return (0); + } + + + toepcb_hold(toep); + process_abort_req(toep, m, toep->tp_toedev); + toepcb_release(toep); + return (0); +} +#ifdef notyet +static void +pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) +{ + struct toedev *tdev = TOE_DEV(parent); + + do_abort_syn_rcv(child, parent); + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { + struct cpl_pass_accept_rpl *rpl = cplhdr(m); + + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + } else + m_free(m); +} +#endif +static void +handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) +{ + CXGB_UNIMPLEMENTED(); + +#ifdef notyet + struct t3cdev *cdev; + struct socket *parent; + struct socket *oreq; + struct t3c_tid_entry *t3c_stid; + struct tid_info *t; + struct tcpcb *otp, *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + /* + * If the connection is being aborted due to the parent listening + * socket going away there's nothing to do, the ABORT_REQ will close + * the connection. + */ + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + m_free(m); + return; + } + + oreq = so->so_incomp; + otp = so_sototcpcb(oreq); + + cdev = T3C_DEV(so); + t = &(T3C_DATA(cdev))->tid_maps; + t3c_stid = lookup_stid(t, otp->ts_recent); + parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; + + so_lock(parent); + pass_open_abort(so, parent, m); + so_unlock(parent); +#endif +} + +/* + * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly + * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV + * connection. + */ +static void +pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) +{ + +#ifdef notyet + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); +#endif + handle_pass_open_arp_failure(m_get_socket(m), m); +} + +/* + * Populate a reject CPL_PASS_ACCEPT_RPL WR. 
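+ * The reply reuses the peer IP from the request (not yet overwritten),
+ * sets F_TCAM_BYPASS with a CPL_PASS_OPEN_REJECT status, and mirrors opt2
+ * into the reserved word as a workaround for a HW bug.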
+ */ +static void +mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) +{ + struct cpl_pass_accept_req *req = cplhdr(req_mbuf); + struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); + unsigned int tid = GET_TID(req); + + m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); + rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ +} + +/* + * Send a deferred reject to an accept request. + */ +static void +reject_pass_request(struct toedev *tdev, struct mbuf *m) +{ + struct mbuf *reply_mbuf; + + reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); + mk_pass_accept_rpl(reply_mbuf, m); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); + m_free(m); +} + +static void +handle_syncache_event(int event, void *arg) +{ + struct toepcb *toep = arg; + + switch (event) { + case TOE_SC_ENTRY_PRESENT: + /* + * entry already exists - free toepcb + * and l2t + */ + printf("syncache entry present\n"); + toepcb_release(toep); + break; + case TOE_SC_DROP: + /* + * The syncache has given up on this entry + * either it timed out, or it was evicted + * we need to explicitly release the tid + */ + printf("syncache entry dropped\n"); + toepcb_release(toep); + break; + default: + log(LOG_ERR, "unknown syncache event %d\n", event); + break; + } +} + +static void +syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) +{ + struct in_conninfo inc; + struct tcpopt to; + struct tcphdr th; + struct inpcb *inp; + int mss, wsf, sack, ts; + uint32_t rcv_isn = ntohl(req->rcv_isn); + + bzero(&to, sizeof(struct tcpopt)); + inp = so_sotoinpcb(lso); + + /* + * Fill out information for entering us into the syncache + */ + inc.inc_fport = th.th_sport = req->peer_port; + inc.inc_lport = th.th_dport = req->local_port; + th.th_seq = req->rcv_isn; + th.th_flags = TH_SYN; + + toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; + + + inc.inc_isipv6 = 0; + inc.inc_len = 0; + inc.inc_faddr.s_addr = req->peer_ip; + inc.inc_laddr.s_addr = req->local_ip; + + DPRINTF("syncache add of %d:%d %d:%d\n", + ntohl(req->local_ip), ntohs(req->local_port), + ntohl(req->peer_ip), ntohs(req->peer_port)); + + mss = req->tcp_options.mss; + wsf = req->tcp_options.wsf; + ts = req->tcp_options.tstamp; + sack = req->tcp_options.sack; + to.to_mss = mss; + to.to_wscale = wsf; + to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); + tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); +} + + +/* + * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket + * lock held. Note that the sock here is a listening socket that is not owned + * by the TOE. 
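+ * Roughly: allocate a toepcb and an L2T entry for the embryonic connection,
+ * insert the TID, hand the request to the syncache, and answer with a
+ * CPL_PASS_ACCEPT_RPL; failures fall through to the reject path.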
+ */ +static void +process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, + struct listen_ctx *lctx) +{ + int rt_flags; + struct l2t_entry *e; + struct iff_mac tim; + struct mbuf *reply_mbuf, *ddp_mbuf = NULL; + struct cpl_pass_accept_rpl *rpl; + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int tid = GET_TID(req); + struct tom_data *d = TOM_DATA(tdev); + struct t3cdev *cdev = d->cdev; + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *newtoep; + struct rtentry *dst; + struct sockaddr_in nam; + struct t3c_data *td = T3C_DATA(cdev); + + reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + if (__predict_false(reply_mbuf == NULL)) { + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) + t3_defer_reply(m, tdev, reject_pass_request); + else { + cxgb_queue_tid_release(cdev, tid); + m_free(m); + } + DPRINTF("failed to get reply_mbuf\n"); + + goto out; + } + + if (tp->t_state != TCPS_LISTEN) { + DPRINTF("socket not in listen state\n"); + + goto reject; + } + + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); + goto reject; + } + +#ifdef notyet + /* + * XXX do route lookup to confirm that we're still listening on this + * address + */ + if (ip_route_input(skb, req->local_ip, req->peer_ip, + G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) + goto reject; + rt_flags = ((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); + dst_release(skb->dst); // done with the input route, release it + skb->dst = NULL; + + if ((rt_flags & RTF_LOCAL) == 0) + goto reject; +#endif + /* + * XXX + */ + rt_flags = RTF_LOCAL; + if ((rt_flags & RTF_LOCAL) == 0) + goto reject; + + /* + * Calculate values and add to syncache + */ + + newtoep = toepcb_alloc(); + if (newtoep == NULL) + goto reject; + + bzero(&nam, sizeof(struct sockaddr_in)); + + nam.sin_len = sizeof(struct sockaddr_in); + nam.sin_family = AF_INET; + nam.sin_addr.s_addr =req->peer_ip; + dst = rtalloc2((struct sockaddr *)&nam, 1, 0); + + if (dst == NULL) { + printf("failed to find route\n"); + goto reject; + } + e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, + (struct sockaddr *)&nam); + if (e == NULL) { + DPRINTF("failed to get l2t\n"); + } + /* + * Point to our listen socket until accept + */ + newtoep->tp_tp = tp; + newtoep->tp_flags = TP_SYN_RCVD; + newtoep->tp_tid = tid; + newtoep->tp_toedev = tdev; + tp->rcv_wnd = select_rcv_wnd(tdev, so); + + cxgb_insert_tid(cdev, d->client, newtoep, tid); + so_lock(so); + LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); + so_unlock(so); + + newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; + + if (newtoep->tp_ulp_mode) { + ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + + if (ddp_mbuf == NULL) + newtoep->tp_ulp_mode = 0; + } + + CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", + TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); + set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); + /* + * XXX workaround for lack of syncache drop + */ + toepcb_hold(newtoep); + syncache_add_accept_req(req, so, newtoep); + + rpl = cplhdr(reply_mbuf); + reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + rpl->wr.wr_lo = 0; + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); + rpl->opt2 = htonl(calc_opt2(so, tdev)); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten + + rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | + V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); + rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | + CPL_PASS_OPEN_ACCEPT); + + DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); + + m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); + + l2t_send(cdev, reply_mbuf, e); + m_free(m); + if (newtoep->tp_ulp_mode) { + __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_MASK, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_VAL, 1); + } else + printf("not offloading\n"); + + + + return; +reject: + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) + mk_pass_accept_rpl(reply_mbuf, m); + else + mk_tid_release(reply_mbuf, newtoep, tid); + cxgb_ofld_send(cdev, reply_mbuf); + m_free(m); +out: +#if 0 + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); +#else + return; +#endif +} + +/* + * Handle a CPL_PASS_ACCEPT_REQ message. + */ +static int +do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */ + struct tom_data *d = listen_ctx->tom_data; + +#if VALIDATE_TID + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int tid = GET_TID(req); + struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; + + if (unlikely(!lsk)) { + printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n", + cdev->name, + (unsigned long)((union listen_entry *)ctx - + t->stid_tab)); + return CPL_RET_BUF_DONE; + } + if (unlikely(tid >= t->ntids)) { + printk(KERN_ERR "%s: passive open TID %u too large\n", + cdev->name, tid); + return CPL_RET_BUF_DONE; + } + /* + * For T3A the current user of the TID may have closed but its last + * message(s) may have been backlogged so the TID appears to be still + * in use. Just take the TID away, the connection can close at its + * own leisure. For T3B this situation is a bug. + */ + if (!valid_new_tid(t, tid) && + cdev->type != T3A) { + printk(KERN_ERR "%s: passive open uses existing TID %u\n", + cdev->name, tid); + return CPL_RET_BUF_DONE; + } +#endif + + process_pass_accept_req(lso, m, &d->tdev, listen_ctx); + return (0); +} + +/* + * Called when a connection is established to translate the TCP options + * reported by HW to FreeBSD's native format. 
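+ * The options arrive packed in a 16-bit field decoded with the G_TCPOPT_*
+ * accessors; the MSS in particular is reported as an index into the
+ * adapter's MTU table, so the clamp below is td->mtus[G_TCPOPT_MSS(opt)]
+ * less 40 bytes for the fixed IP and TCP headers (an MTU entry of 1500,
+ * for example, yields a 1460-byte clamp).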
+ */ +static void +assign_rxopt(struct socket *so, unsigned int opt) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); + + inp_lock_assert(tp->t_inpcb); + + toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; + tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; + tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; + tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) + tp->rcv_scale = tp->request_r_scale; +} + +/* + * Completes some final bits of initialization for just established connections + * and changes their state to TCP_ESTABLISHED. + * + * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. + */ +static void +make_established(struct socket *so, u32 snd_isn, unsigned int opt) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; + assign_rxopt(so, opt); + + /* + *XXXXXXXXXXX + * + */ +#ifdef notyet + so->so_proto->pr_ctloutput = t3_ctloutput; +#endif + +#if 0 + inet_sk(sk)->id = tp->write_seq ^ jiffies; +#endif + /* + * XXX not clear what rcv_wup maps to + */ + /* + * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't + * pass through opt0. + */ + if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) + toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); + + dump_toepcb(toep); + +#ifdef notyet +/* + * no clean interface for marking ARP up to date + */ + dst_confirm(sk->sk_dst_cache); +#endif + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + soisconnected(so); +} + +static int +syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) +{ + + struct in_conninfo inc; + struct tcpopt to; + struct tcphdr th; + int mss, wsf, sack, ts; + struct mbuf *m = NULL; + const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); + unsigned int opt; + +#ifdef MAC +#error "no MAC support" +#endif + + opt = ntohs(req->tcp_opt); + + bzero(&to, sizeof(struct tcpopt)); + + /* + * Fill out information for entering us into the syncache + */ + inc.inc_fport = th.th_sport = req->peer_port; + inc.inc_lport = th.th_dport = req->local_port; + th.th_seq = req->rcv_isn; + th.th_flags = TH_ACK; + + inc.inc_isipv6 = 0; + inc.inc_len = 0; + inc.inc_faddr.s_addr = req->peer_ip; + inc.inc_laddr.s_addr = req->local_ip; + + mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; + wsf = G_TCPOPT_WSCALE_OK(opt); + ts = G_TCPOPT_TSTAMP(opt); + sack = G_TCPOPT_SACK(opt); + + to.to_mss = mss; + to.to_wscale = G_TCPOPT_SND_WSCALE(opt); + to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); + + DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", + ntohl(req->local_ip), ntohs(req->local_port), + ntohl(req->peer_ip), ntohs(req->peer_port), + mss, wsf, ts, sack); + return tcp_offload_syncache_expand(&inc, &to, &th, so, m); +} + + +/* + * Process a CPL_PASS_ESTABLISH message. 
XXX a lot of the locking doesn't work + * if we are in TCP_SYN_RECV due to crossed SYNs + */ +static int +do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_pass_establish *req = cplhdr(m); + struct toepcb *toep = (struct toepcb *)ctx; + struct tcpcb *tp = toep->tp_tp; + struct socket *so, *lso; + struct t3c_data *td = T3C_DATA(cdev); + struct sockbuf *snd, *rcv; + + // Complete socket initialization now that we have the SND_ISN + + struct toedev *tdev; + + + tdev = toep->tp_toedev; + + inp_wlock(tp->t_inpcb); + + /* + * + * XXX need to add reference while we're manipulating + */ + so = lso = inp_inpcbtosocket(tp->t_inpcb); + + inp_wunlock(tp->t_inpcb); + + so_lock(so); + LIST_REMOVE(toep, synq_entry); + so_unlock(so); + + if (!syncache_expand_establish_req(req, &so, toep)) { + /* + * No entry + */ + CXGB_UNIMPLEMENTED(); + } + if (so == NULL) { + /* + * Couldn't create the socket + */ + CXGB_UNIMPLEMENTED(); + } + + tp = so_sototcpcb(so); + inp_wlock(tp->t_inpcb); + + snd = so_sockbuf_snd(so); + rcv = so_sockbuf_rcv(so); + + snd->sb_flags |= SB_NOCOALESCE; + rcv->sb_flags |= SB_NOCOALESCE; + + toep->tp_tp = tp; + toep->tp_flags = 0; + tp->t_toe = toep; + reset_wr_list(toep); + tp->rcv_wnd = select_rcv_wnd(tdev, so); + tp->rcv_nxt = toep->tp_copied_seq; + install_offload_ops(so); + + toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs); + toep->tp_wr_unacked = 0; + toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); + toep->tp_qset_idx = 0; + toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu); + + /* + * XXX Cancel any keep alive timer + */ + + make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + /* + * XXX workaround for lack of syncache drop + */ + toepcb_release(toep); + inp_wunlock(tp->t_inpcb); + + CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); +#ifdef notyet + /* + * XXX not sure how these checks map to us + */ + if (unlikely(sk->sk_socket)) { // simultaneous opens only + sk->sk_state_change(sk); + sk_wake_async(so, 0, POLL_OUT); + } + /* + * The state for the new connection is now up to date. + * Next check if we should add the connection to the parent's + * accept queue. When the parent closes it resets connections + * on its SYN queue, so check if we are being reset. If so we + * don't need to do anything more, the coming ABORT_RPL will + * destroy this socket. Otherwise move the connection to the + * accept queue. + * + * Note that we reset the synq before closing the server so if + * we are not being reset the stid is still open. + */ + if (unlikely(!tp->forward_skb_hint)) { // removed from synq + __kfree_skb(skb); + goto unlock; + } +#endif + m_free(m); + + return (0); +} + +/* + * Fill in the right TID for CPL messages waiting in the out-of-order queue + * and send them to the TOE. + */ +static void +fixup_and_send_ofo(struct toepcb *toep) +{ + struct mbuf *m; + struct toedev *tdev = toep->tp_toedev; + struct tcpcb *tp = toep->tp_tp; + unsigned int tid = toep->tp_tid; + + log(LOG_NOTICE, "fixup_and_send_ofo\n"); + + inp_lock_assert(tp->t_inpcb); + while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { + /* + * A variety of messages can be waiting but the fields we'll + * be touching are common to all so any message type will do. 
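+		 * Only wr_lo and the opcode/TID word are rewritten, and those
+		 * sit at the same offsets in every CPL, so cpl_close_con_req
+		 * below is used purely as a representative layout.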
+ */ + struct cpl_close_con_req *p = cplhdr(m); + + p->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + } +} + +/* + * Updates socket state from an active establish CPL message. Runs with the + * socket lock held. + */ +static void +socket_act_establish(struct socket *so, struct mbuf *m) +{ + struct cpl_act_establish *req = cplhdr(m); + u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + if (__predict_false(tp->t_state != TCPS_SYN_SENT)) + log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n", + toep->tp_tid, tp->t_state); + + tp->ts_recent_age = ticks; + tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn; + toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs; + + make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + /* + * Now that we finally have a TID send any CPL messages that we had to + * defer for lack of a TID. + */ + if (mbufq_len(&toep->out_of_order_queue)) + fixup_and_send_ofo(toep); + + if (__predict_false(so_state_get(so) & SS_NOFDREF)) { + /* + * XXX does this even make sense? + */ + so_sorwakeup(so); + } + m_free(m); +#ifdef notyet +/* + * XXX assume no write requests permitted while socket connection is + * incomplete + */ + /* + * Currently the send queue must be empty at this point because the + * socket layer does not send anything before a connection is + * established. To be future proof though we handle the possibility + * that there are pending buffers to send (either TX_DATA or + * CLOSE_CON_REQ). First we need to adjust the sequence number of the + * buffers according to the just learned write_seq, and then we send + * them on their way. + */ + fixup_pending_writeq_buffers(sk); + if (t3_push_frames(so, 1)) + sk->sk_write_space(sk); +#endif + + toep->tp_state = tp->t_state; + tcpstat.tcps_connects++; + +} + +/* + * Process a CPL_ACT_ESTABLISH message. + */ +static int +do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_act_establish *req = cplhdr(m); + unsigned int tid = GET_TID(req); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct toepcb *toep = (struct toepcb *)ctx; + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct toedev *tdev; + struct tom_data *d; + + if (tp == NULL) { + free_atid(cdev, atid); + return (0); + } + inp_wlock(tp->t_inpcb); + + /* + * XXX + */ + so = inp_inpcbtosocket(tp->t_inpcb); + tdev = toep->tp_toedev; /* blow up here if link was down */ + d = TOM_DATA(tdev); + + /* + * It's OK if the TID is currently in use, the owning socket may have + * backlogged its last CPL message(s). Just take it away. + */ + toep->tp_tid = tid; + toep->tp_tp = tp; + so_insert_tid(d, toep, tid); + free_atid(cdev, atid); + toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); + + socket_act_establish(so, m); + inp_wunlock(tp->t_inpcb); + CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); + + return (0); +} + +/* + * Process an acknowledgment of WR completion. Advance snd_una and send the + * next batch of work requests from the write queue. 
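+ * Each queued WR mbuf stores its credit cost in m_pkthdr.csum_data and its
+ * payload length in m_pkthdr.len.  The loop below retires WRs while ACKed
+ * credits remain and, when an ACK only partially covers the WR at the head
+ * of the queue, just decrements its remaining cost: an ACK for 3 credits
+ * against pending costs of 2 and 2, say, frees the first WR and leaves the
+ * second owing 1 credit.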
+ */ +static void +wr_ack(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct cpl_wr_ack *hdr = cplhdr(m); + struct socket *so; + unsigned int credits = ntohs(hdr->credits); + u32 snd_una = ntohl(hdr->snd_una); + int bytes = 0; + struct sockbuf *snd; + + CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + toep->tp_wr_avail += credits; + if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) + toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; + + while (credits) { + struct mbuf *p = peek_wr(toep); + + if (__predict_false(!p)) { + log(LOG_ERR, "%u WR_ACK credits for TID %u with " + "nothing pending, state %u wr_avail=%u\n", + credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); + break; + } + CTR2(KTR_TOM, + "wr_ack: p->credits=%d p->bytes=%d", + p->m_pkthdr.csum_data, p->m_pkthdr.len); + KASSERT(p->m_pkthdr.csum_data != 0, + ("empty request still on list")); + + if (__predict_false(credits < p->m_pkthdr.csum_data)) { + +#if DEBUG_WR > 1 + struct tx_data_wr *w = cplhdr(p); + log(LOG_ERR, + "TID %u got %u WR credits, need %u, len %u, " + "main body %u, frags %u, seq # %u, ACK una %u," + " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", + toep->tp_tid, credits, p->csum, p->len, + p->len - p->data_len, skb_shinfo(p)->nr_frags, + ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), + toep->tp_wr_avail, count_pending_wrs(tp) - credits); +#endif + p->m_pkthdr.csum_data -= credits; + break; + } else { + dequeue_wr(toep); + credits -= p->m_pkthdr.csum_data; + bytes += p->m_pkthdr.len; + CTR3(KTR_TOM, + "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", + p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); + + m_free(p); + } + } + +#if DEBUG_WR + check_wr_invariants(tp); +#endif + + if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { +#if VALIDATE_SEQ + struct tom_data *d = TOM_DATA(TOE_DEV(so)); + + log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " + "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, + toep->tp_tid, tp->snd_una); +#endif + goto out_free; + } + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + tp->ts_recent_age = ticks; +#ifdef notyet + /* + * Keep ARP entry "minty fresh" + */ + dst_confirm(sk->sk_dst_cache); +#endif + if (tp->snd_una == tp->snd_nxt) + toep->tp_flags &= ~TP_TX_WAIT_IDLE; + } + + snd = so_sockbuf_snd(so); + if (bytes) { + CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); + snd = so_sockbuf_snd(so); + sockbuf_lock(snd); + sbdrop_locked(snd, bytes); + so_sowwakeup_locked(so); + } + + if (snd->sb_sndptroff < snd->sb_cc) + t3_push_frames(so, 0); + +out_free: + inp_wunlock(tp->t_inpcb); + m_free(m); +} + +/* + * Handler for TX_DATA_ACK CPL messages. + */ +static int +do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + VALIDATE_SOCK(so); + + wr_ack(toep, m); + return 0; +} + +/* + * Handler for TRACE_PKT CPL messages. Just sink these packets. + */ +static int +do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) +{ + m_freem(m); + return 0; +} + +/* + * Reset a connection that is on a listener's SYN queue or accept queue, + * i.e., one that has not had a struct socket associated with it. + * Must be called from process context. + * + * Modeled after code in inet_csk_listen_stop(). 
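+ * Two flavors follow: t3_disconnect_acceptq() walks connections already on
+ * the listener's accept queue via so_listeners_apply_all(), while
+ * t3_reset_synq() drains the toepcbs still sitting on the listen context's
+ * SYN queue.  Both simply fire an ABORT_REQ and let the eventual ABORT_RPL
+ * complete the teardown.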
+ */ +static void +t3_reset_listen_child(struct socket *child) +{ + struct tcpcb *tp = so_sototcpcb(child); + + t3_send_reset(tp->t_toe); +} + + +static void +t3_child_disconnect(struct socket *so, void *arg) +{ + struct tcpcb *tp = so_sototcpcb(so); + + if (tp->t_flags & TF_TOE) { + inp_wlock(tp->t_inpcb); + t3_reset_listen_child(so); + inp_wunlock(tp->t_inpcb); + } +} + +/* + * Disconnect offloaded established but not yet accepted connections sitting + * on a server's accept_queue. We just send an ABORT_REQ at this point and + * finish off the disconnect later as we may need to wait for the ABORT_RPL. + */ +void +t3_disconnect_acceptq(struct socket *listen_so) +{ + + so_lock(listen_so); + so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); + so_unlock(listen_so); +} + +/* + * Reset offloaded connections sitting on a server's syn queue. As above + * we send ABORT_REQ and finish off when we get ABORT_RPL. + */ + +void +t3_reset_synq(struct listen_ctx *lctx) +{ + struct toepcb *toep; + + so_lock(lctx->lso); + while (!LIST_EMPTY(&lctx->synq_head)) { + toep = LIST_FIRST(&lctx->synq_head); + LIST_REMOVE(toep, synq_entry); + toep->tp_tp = NULL; + t3_send_reset(toep); + cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); + toepcb_release(toep); + } + so_unlock(lctx->lso); +} + + +int +t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color) +{ + unsigned int i, j, pidx; + struct pagepod *p; + struct mbuf *m; + struct ulp_mem_io *req; + unsigned int tid = toep->tp_tid; + const struct tom_data *td = TOM_DATA(toep->tp_toedev); + unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; + + CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", + gl, nppods, tag, maxoff, pg_off, color); + + for (i = 0; i < nppods; ++i) { + m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + req = mtod(m, struct ulp_mem_io *); + m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->wr.wr_lo = 0; + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | + V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); + + p = (struct pagepod *)(req + 1); + if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { + p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); + p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | + V_PPOD_COLOR(color)); + p->pp_max_offset = htonl(maxoff); + p->pp_page_offset = htonl(pg_off); + p->pp_rsvd = 0; + for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) + p->pp_addr[j] = pidx < gl->dgl_nelem ? + htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; + } else + p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ + send_or_defer(toep, m, 0); + ppod_addr += PPOD_SIZE; + } + return (0); +} + +/* + * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_cpl_barrier_ulp(struct cpl_barrier *b) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); + b->opcode = CPL_BARRIER; +} + +/* + * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 
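+ * As with the other mk_*_ulp() helpers, the CPL is fronted by a ulp_txpkt
+ * header whose length is expressed in 8-byte flits (hence sizeof(*req) / 8
+ * below), which is what allows several CPLs to be packed back to back
+ * inside a single work request.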
+ */ +static inline void +mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + txpkt = (struct ulp_txpkt *)req; + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); + req->cpuno = htons(cpuno); +} + +/* + * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, + unsigned int word, uint64_t mask, uint64_t val) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + tid, word, mask, val); + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(1); + req->cpu_idx = 0; + req->word = htons(word); + req->mask = htobe64(mask); + req->val = htobe64(val); +} + +/* + * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. + */ +static void +mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack, + unsigned int tid, unsigned int credits) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); + OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); + ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) | + V_RX_CREDITS(credits)); +} + +void +t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_barrier *lock; + struct cpl_set_tcb_field *req; + struct cpl_get_tcb *getreq; + struct ddp_state *p = &toep->tp_ddp_state; + +#if 0 + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); +#endif + wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + + sizeof(*getreq); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + lock = (struct cpl_barrier *)(wr + 1); + mk_cpl_barrier_ulp(lock); + + req = (struct cpl_set_tcb_field *)(lock + 1); + + CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); + + /* Hmmm, not sure if this actually a good thing: reactivating + * the other buffer might be an issue if it has been completed + * already. However, that is unlikely, since the fact that the UBUF + * is not completed indicates that there is no oustanding data. 
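+	 * The compound WR assembled here is laid out as: work request header,
+	 * CPL_BARRIER, CPL_SET_TCB_FIELD (flip the DDP flags for the chosen
+	 * buffer), CPL_GET_TCB (read back how much data that buffer actually
+	 * received), CPL_BARRIER, matching the wrlen computed above.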
+	 */
+	if (bufidx == 0)
+		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+				     V_TF_DDP_ACTIVE_BUF(1) |
+				     V_TF_DDP_BUF0_VALID(1),
+				     V_TF_DDP_ACTIVE_BUF(1));
+	else
+		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+				     V_TF_DDP_ACTIVE_BUF(1) |
+				     V_TF_DDP_BUF1_VALID(1), 0);
+
+	getreq = (struct cpl_get_tcb *)(req + 1);
+	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
+
+	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
+
+	/* Keep track of the number of outstanding CPL_GET_TCB requests
+	 */
+	p->get_tcb_count++;
+
+#ifdef T3_TRACE
+	T3_TRACE1(TIDTB(so),
+		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
+#endif
+	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
+}
+
+/**
+ * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
+ * @toep: the toepcb for the connection using the buffers
+ * @bufidx: index of HW DDP buffer (0 or 1)
+ * @tag0: new tag for HW buffer 0
+ * @tag1: new tag for HW buffer 1
+ * @len: new length for HW buf @bufidx
+ *
+ * Sends a compound WR to overlay a new DDP buffer on top of an existing
+ * buffer by changing the buffer tag and length and setting the valid and
+ * active flag accordingly.  The caller must ensure the new buffer is at
+ * least as big as the existing one.  Since we typically reprogram both HW
+ * buffers this function sets both tags for convenience.  Read the TCB to
+ * determine how much data was written into the buffer before the overlay
+ * took place.
+ */
+void
+t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
+		  unsigned int tag1, unsigned int len)
+{
+	unsigned int wrlen;
+	struct mbuf *m;
+	struct work_request_hdr *wr;
+	struct cpl_get_tcb *getreq;
+	struct cpl_set_tcb_field *req;
+	struct ddp_state *p = &toep->tp_ddp_state;
+
+	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
+	    bufidx, tag0, tag1, len);
+#if 0
+	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
+#endif
+	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
+	m = m_gethdr_nofail(wrlen);
+	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+	wr = mtod(m, struct work_request_hdr *);
+	m->m_pkthdr.len = m->m_len = wrlen;
+	bzero(wr, wrlen);
+
+
+	/* Set the ATOMIC flag to make sure that TP processes the following
+	 * CPLs in an atomic manner and no wire segments can be interleaved.
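+	 * The layout below is: work request header, three CPL_SET_TCB_FIELDs
+	 * (retag both buffers, then program the chosen buffer's length and
+	 * its DDP flags), followed by a CPL_GET_TCB that reads back how much
+	 * data landed in the old buffer before the overlay.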
+ */ + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); + req = (struct cpl_set_tcb_field *)(wr + 1); + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | + V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, + V_TCB_RX_DDP_BUF0_TAG(tag0) | + V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); + req++; + if (bufidx == 0) { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_0(1) | + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_0(0) | + V_TF_DDP_BUF0_VALID(1)); + } else { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_1(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_1(0) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); + } + + getreq = (struct cpl_get_tcb *)(req + 1); + mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); + + /* Keep track of the number of oustanding CPL_GET_TCB requests + */ + p->get_tcb_count++; + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(sk), + "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " + "len %d", + bufidx, tag0, tag1, len); +#endif + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +/* + * Sends a compound WR containing all the CPL messages needed to program the + * two HW DDP buffers, namely optionally setting up the length and offset of + * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. + */ +void +t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, + unsigned int len1, unsigned int offset1, + uint64_t ddp_flags, uint64_t flag_mask, int modulate) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_set_tcb_field *req; + + CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", + len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); + +#if 0 + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); +#endif + wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + + (len1 ? sizeof(*req) : 0) + + (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + req = (struct cpl_set_tcb_field *)(wr + 1); + if (len0) { /* program buffer 0 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); + req++; + } + if (len1) { /* program buffer 1 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, + V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); + req++; + } + + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, + ddp_flags); + + if (modulate) { + mk_rx_data_ack_ulp(toep, + (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, + toep->tp_copied_seq - toep->tp_rcv_wup); + toep->tp_rcv_wup = toep->tp_copied_seq; + } + +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " + "modulate %d", + len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, + modulate); +#endif + + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +void +t3_init_wr_tab(unsigned int wr_len) +{ + int i; + + if (mbuf_wrs[1]) /* already initialized */ + return; + + for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { + int sgl_len = (3 * i) / 2 + (i & 1); + + sgl_len += 3; + mbuf_wrs[i] = sgl_len <= wr_len ? + 1 : 1 + (sgl_len - 2) / (wr_len - 1); + } + + wrlen = wr_len * 8; +} + +int +t3_init_cpl_io(void) +{ +#ifdef notyet + tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); + if (!tcphdr_skb) { + log(LOG_ERR, + "Chelsio TCP offload: can't allocate sk_buff\n"); + return -1; + } + skb_put(tcphdr_skb, sizeof(struct tcphdr)); + tcphdr_skb->h.raw = tcphdr_skb->data; + memset(tcphdr_skb->data, 0, tcphdr_skb->len); +#endif + + t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); + t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); + t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); + t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); + t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); + t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); + t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); + t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); + t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); + t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); + t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); + t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); + t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); + t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); + return (0); +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c new file mode 100644 index 0000000000000..77a3d760f54f7 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -0,0 +1,1030 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/syslog.h> +#include <sys/uio.h> +#include <sys/file.h> + +#include <machine/bus.h> +#include <machine/cpu.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + +#include <dev/cxgb/cxgb_config.h> +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + + +static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, + struct uio *uio, struct mbuf *top, struct mbuf *control, + int flags, struct thread *td); + +static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, + struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, + int *flagsp); + +#define TMP_IOV_MAX 16 +#ifndef PG_FRAME +#define PG_FRAME ~PAGE_MASK +#endif +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 
M_NOWAIT : M_WAITOK) + +void +t3_init_socket_ops(void) +{ + struct protosw *prp; + + prp = pffindtype(AF_INET, SOCK_STREAM); + pru_sosend = prp->pr_usrreqs->pru_sosend; + pru_soreceive = prp->pr_usrreqs->pru_soreceive; +} + +struct cxgb_dma_info { + size_t cdi_mapped; + int cdi_nsegs; + bus_dma_segment_t *cdi_segs; + +}; + +static void +cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs, + bus_size_t mapsize, int error) +{ + struct cxgb_dma_info *cdi = arg; + + cdi->cdi_mapped = mapsize; + cdi->cdi_nsegs = nsegs; + cdi->cdi_segs = segs; +} + +static void +iov_adj(struct iovec **iov, int *iovcnt, size_t count) +{ + struct iovec *iovtmp; + int iovcnttmp; + caddr_t ptmp; + + if (count > 0) { + iovtmp = *iov; + iovcnttmp = *iovcnt; + while (count > 0) { + if (count < iovtmp->iov_len) { + ptmp = iovtmp->iov_base; + ptmp += count; + iovtmp->iov_base = ptmp; + iovtmp->iov_len -= count; + break; + } else + count -= iovtmp->iov_len; + iovtmp++; + iovcnttmp--; + } + *iov = iovtmp; + *iovcnt = iovcnttmp; + } else if (count < 0) { + iovtmp = &(*iov)[*iovcnt - 1]; + iovcnttmp = *iovcnt; + while (count < 0) { + if (-count < iovtmp->iov_len) { + iovtmp->iov_len += count; + break; + } else + count += iovtmp->iov_len; + iovtmp--; + iovcnttmp--; + } + *iovcnt = iovcnttmp; + } +} + +static void +cxgb_zero_copy_free(void *cl, void *arg) +{ + struct mbuf_vec *mv; + struct mbuf *m = (struct mbuf *)cl; + + mv = mtomv(m); + /* + * Physical addresses, don't try to free should be unheld separately from sbdrop + * + */ + mv->mv_count = 0; + m_free_iovec(m, m->m_type); +} + + +static int +cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) +{ + struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + int err, i, count, totcount, maxcount, totbytes, npages, curbytes; + uint64_t start, end; + vm_page_t *mp; + + totbytes = totcount = 0; + maxcount = *held; + + mp = m; + for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { + count = maxcount - totcount; + + start = (uintptr_t)iov->iov_base; + end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len); + start &= PG_FRAME; + end += PAGE_MASK; + end &= PG_FRAME; + npages = (end - start) >> PAGE_SHIFT; + + count = min(count, npages); + + err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); + if (err) { + vm_fault_unhold_pages(m, totcount); + return (err); + } + mp += count; + totcount += count; + curbytes = iov->iov_len; + if (count != npages) + curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK); + totbytes += curbytes; + } + uio->uio_resid -= totbytes; + + return (0); +} + +/* + * Returns whether a connection should enable DDP. 
This happens when all of + * the following conditions are met: + * - the connection's ULP mode is DDP + * - DDP is not already enabled + * - the last receive was above the DDP threshold + * - receive buffers are in user space + * - receive side isn't shutdown (handled by caller) + * - the connection's receive window is big enough so that sizable buffers + * can be posted without closing the window in the middle of DDP (checked + * when the connection is offloaded) + */ +static int +so_should_ddp(const struct toepcb *toep, int last_recv_len) +{ + + DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n", + toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres), + toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN)); + + return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) && + last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && + toep->tp_tp->rcv_wnd > + (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); +} + +static inline int +is_ddp(const struct mbuf *m) +{ + return ((m->m_flags & M_DDP) != 0); +} + +static inline int +is_ddp_psh(const struct mbuf *m) +{ + return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0); +} + +static int +m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + int curlen, startlen, resid_init, err = 0; + caddr_t buf; + + DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n", + m, offset, len); + + startlen = len; + resid_init = uio->uio_resid; + while (m && len) { + buf = mtod(m, caddr_t); + curlen = m->m_len; + if (offset && (offset < curlen)) { + curlen -= offset; + buf += offset; + offset = 0; + } else if (offset) { + offset -= curlen; + m = m->m_next; + continue; + } + err = uiomove(buf, min(len, curlen), uio); + if (err) { + printf("uiomove returned %d\n", err); + return (err); + } + + len -= min(len, curlen); + m = m->m_next; + } + DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n", + startlen - len, resid_init, uio->uio_resid); + return (err); +} + +/* + * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the + * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a + * DDP buffer. 
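+ * Three cases are handled below: plain RX_DATA is copied with m_uiomove();
+ * user-space DDP (DDP_BF_NOCOPY) needs no copy at all since the payload is
+ * already in the user buffer, so only the iovec and resid are advanced;
+ * kernel-buffer DDP goes through t3_ddp_copy().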
+ */ +static inline int +copy_data(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + struct iovec *to = uio->uio_iov; + int err; + + if (__predict_true(!is_ddp(m))) /* RX_DATA */ + return m_uiomove(m, offset, len, uio); + if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */ + to->iov_len -= len; + to->iov_base = ((caddr_t)to->iov_base) + len; + uio->uio_iov = to; + uio->uio_resid -= len; + return (0); + } + err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */ + return (err); +} + +static void +cxgb_wait_dma_completion(struct toepcb *toep) +{ + struct rwlock *lock; + + lock = &toep->tp_tp->t_inpcb->inp_lock; + inp_wlock(toep->tp_tp->t_inpcb); + cv_wait_unlock(&toep->tp_cv, lock); +} + +static int +cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) +{ + int i, seg_count, err, type; + struct mbuf *m0; + struct cxgb_dma_info cdi; + struct mbuf_vec *mv; + struct mbuf_iovec *mi; + bus_dma_segment_t *segs; + + err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio, + cxgb_dma_callback, &cdi, 0); + + if (err) + return (err); + seg_count = cdi.cdi_nsegs; + if ((m0 = mcl_alloc(seg_count, &type)) == NULL) { + bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap); + return (ENOMEM); + } + segs = cdi.cdi_segs; + m0->m_type = type; + m0->m_flags = (M_EXT|M_NOFREE); + m0->m_ext.ext_type = EXT_EXTREF; + m0->m_ext.ext_free = cxgb_zero_copy_free; + m0->m_ext.ext_arg1 = NULL; /* XXX: probably wrong /phk */ + m0->m_ext.ext_arg2 = NULL; + + mv = mtomv(m0); + mv->mv_count = seg_count; + mv->mv_first = 0; + for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++) + mi_collapse_sge(mi, segs); + + *m = m0; + + /* + * This appears to be a no-op at the moment + * as busdma is all or nothing need to make + * sure the tag values are large enough + * + */ + if (cdi.cdi_mapped < uio->uio_resid) { + uio->uio_resid -= cdi.cdi_mapped; + } else + uio->uio_resid = 0; + + return (0); +} + +static int +t3_sosend(struct socket *so, struct uio *uio) +{ + int rv, count, hold_resid, sent, iovcnt; + struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct mbuf *m; + struct uio uiotmp; + struct sockbuf *snd; + + /* + * Events requiring iteration: + * - number of pages exceeds max hold pages for process or system + * - number of pages exceeds maximum sg entries for a single WR + * + * We're limited to holding 128 pages at once - and we're limited to + * 34 SG entries per work request, but each SG entry can be any number + * of contiguous pages + * + */ + + uiotmp = *uio; + iovcnt = uio->uio_iovcnt; + iov = uio->uio_iov; + sent = 0; + snd = so_sockbuf_snd(so); +sendmore: + /* + * Make sure we don't exceed the socket buffer + */ + count = min(toep->tp_page_count, (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE); + rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0); + hold_resid = uiotmp.uio_resid; + if (rv) + return (rv); + + /* + * Bump past sent and shave off the unheld amount + */ + if (hold_resid > 0) { + iovtmpp = iovtmp; + memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); + if (sent) + iov_adj(&iovtmpp, &iovcnt, sent); + iov_adj(&iovtmpp, &iovcnt, -hold_resid); + uiotmp.uio_iov = iovtmpp; + uiotmp.uio_iovcnt = iovcnt; + + } + uiotmp.uio_resid = uio->uio_resid - hold_resid; + + /* + * Push off all held pages + * + */ + while (uiotmp.uio_resid > 0) { + rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m); + if (rv) { + vm_fault_unhold_pages(toep->tp_pages, count); + return (rv); + } + 
uio->uio_resid -= m->m_pkthdr.len; + sent += m->m_pkthdr.len; + sbappend(snd, m); + t3_push_frames(so, TRUE); + iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); + } + + /* + * Wait for pending I/O to be DMA'd to the card + * + */ + cxgb_wait_dma_completion(toep); + vm_fault_unhold_pages(toep->tp_pages, count); + /* + * If there is more data to send adjust local copy of iov + * to point to teh start + */ + if (hold_resid) { + iovtmpp = iovtmp; + memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); + iov_adj(&iovtmpp, &iovcnt, sent); + uiotmp = *uio; + uiotmp.uio_iov = iovtmpp; + uiotmp.uio_iovcnt = iovcnt; + goto sendmore; + } + + return (0); +} + +static int +cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *control, int flags, struct thread *td) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toedev *tdev; + int zcopy_thres, zcopy_enabled, rv; + + /* + * In order to use DMA direct from userspace the following + * conditions must be met: + * - the connection is currently offloaded + * - ddp is enabled + * - the number of bytes to be transferred exceeds the threshold + * - the number of bytes currently in flight won't exceed the in-flight + * threshold XXX TODO + * - vm_fault_hold_user_pages succeeds + * - blocking socket XXX for now + * + */ + if (tp && tp->t_flags & TF_TOE) { + struct toepcb *toep = tp->t_toe; + + tdev = toep->tp_toedev; + zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); + zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); + + if (uio && (uio->uio_resid > zcopy_thres) && + (uio->uio_iovcnt < TMP_IOV_MAX) && ((so_state_get(so) & SS_NBIO) == 0) + && zcopy_enabled) { + rv = t3_sosend(so, uio); + if (rv != EAGAIN) + return (rv); + } + } + return pru_sosend(so, addr, uio, top, control, flags, td); +} + +/* + * Following replacement or removal of the first mbuf on the first mbuf chain + * of a socket buffer, push necessary state changes back into the socket + * buffer so that other consumers see the values consistently. 'nextrecord' + * is the callers locally stored value of the original value of + * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. + * NOTE: 'nextrecord' may be NULL. + */ +static __inline void +sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + sockbuf_lock_assert(sb); + /* + * First, update for the new value of nextrecord. If necessary, make + * it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect the new + * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the + * addition of a second clause that takes care of the case where + * sb_mb has been updated, but remains the last record. + */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +#define IS_NONBLOCKING(so) (so_state_get(so) & SS_NBIO) + +static int +t3_soreceive(struct socket *so, int *flagsp, struct uio *uio) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct mbuf *m; + uint32_t offset; + int err, flags, avail, len, copied, copied_unacked; + int target; /* Read at least this many bytes */ + int user_ddp_ok; + struct ddp_state *p; + struct inpcb *inp = so_sotoinpcb(so); + int socket_state, socket_error; + struct sockbuf *rcv; + + avail = offset = copied = copied_unacked = 0; + flags = flagsp ? 
(*flagsp &~ MSG_EOR) : 0; + rcv = so_sockbuf_rcv(so); + + err = sblock(rcv, SBLOCKWAIT(flags)); + p = &toep->tp_ddp_state; + + if (err) + return (err); + + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + + p->user_ddp_pending = 0; +restart: + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + + len = uio->uio_resid; + m = rcv->sb_mb; + target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat; + user_ddp_ok = p->ubuf_ddp_ready; + p->cancel_ubuf = 0; + + if (len == 0) + goto done; + if (m) + goto got_mbuf; + + /* empty receive queue */ + if (copied >= target && (rcv->sb_mb == NULL) && + !p->user_ddp_pending) + goto done; + + socket_state = so_state_get(so); + socket_error = so_error_get(so); + rcv = so_sockbuf_rcv(so); + + if (copied) { + if (socket_error || tp->t_state == TCPS_CLOSED || + (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))) + goto done; + } else { + if (socket_state & SS_NOFDREF) + goto done; + if (socket_error) { + err = socket_error; + socket_error = 0; + goto done; + } + if (rcv->sb_state & SBS_CANTRCVMORE) + goto done; + if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) + goto done; + if (tp->t_state == TCPS_CLOSED) { + err = ENOTCONN; + goto done; + } + } + if (rcv->sb_mb && !p->user_ddp_pending) { + sockbuf_unlock(rcv); + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + sockbuf_lock(rcv); + copied_unacked = 0; + goto restart; + } + if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(toep, rcv, uio, + IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + } + if (p->kbuf[0] && (p->kbuf_posted == 0)) { + t3_post_kbuf(toep, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } + if (p->user_ddp_pending) { + /* One shot at DDP if we already have enough data */ + if (copied >= target) + user_ddp_ok = 0; + + if (rcv->sb_state & SBS_CANTRCVMORE) + goto done; + CTR0(KTR_TOM, "ddp pending -- waiting"); + if ((err = sbwait(rcv)) != 0) + goto done; +//for timers to work await_ddp_completion(sk, flags, &timeo); + } else if (copied >= target) + goto done; + else { + if (copied_unacked) { + int i = 0; + + sockbuf_unlock(rcv); + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + copied_unacked = 0; + if (mp_ncpus > 1) + while (i++ < 200 && rcv->sb_mb == NULL) + cpu_spinwait(); + sockbuf_lock(rcv); + } + if (rcv->sb_mb) + goto restart; + + if (rcv->sb_state & SBS_CANTRCVMORE) + goto done; + + CTR0(KTR_TOM, "no buffers -- waiting"); + + if ((err = sbwait(rcv)) != 0) + goto done; + } + goto restart; +got_mbuf: + /* + * Adjust the mbuf seqno if it has already been partially processed by + * soreceive_generic + */ + if (m->m_pkthdr.len != m->m_len) { + m->m_seq += m->m_pkthdr.len - m->m_len; + m->m_pkthdr.len = m->m_len; + } + + CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u " + "m_seq=0x%08x c_seq=0x%08x c_unack=%u", + (is_ddp(m) ? 
m->m_ddp_flags : 0), m->m_pkthdr.len, len, + m->m_seq, toep->tp_copied_seq, copied_unacked); + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), + ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), + m->m_ext.ext_type, m->m_len, m->m_pkthdr.len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p" + " m_flags=0x%x m->m_len=%d", m->m_next, m->m_nextpkt, m->m_flags, m->m_len)); + if (m->m_pkthdr.len == 0) { + if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0) + panic("empty mbuf and NOCOPY not set\n"); + CTR0(KTR_TOM, "ddp done notification"); + p->user_ddp_pending = 0; + sbdroprecord_locked(rcv); + goto done; + } + + KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0, + ("offset will go negative: offset=%d copied_seq=0x%08x copied_unacked=%d m_seq=0x%08x", + offset, toep->tp_copied_seq, copied_unacked, m->m_seq)); + offset = toep->tp_copied_seq + copied_unacked - m->m_seq; + + if (offset >= m->m_pkthdr.len) + panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x " + "seq 0x%x pktlen %d ddp flags 0x%x", offset, + toep->tp_copied_seq + copied_unacked, m->m_seq, + m->m_pkthdr.len, m->m_ddp_flags); + + avail = m->m_pkthdr.len - offset; + if (len < avail) { + if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) + panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset); + avail = len; + rcv->sb_flags |= SB_IN_TOE; + } else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0) + rcv->sb_flags &= ~SB_IN_TOE; + +#ifdef URGENT_DATA_SUPPORTED + /* + * Check if the data we are preparing to copy contains urgent + * data. Either stop short of urgent data or skip it if it's + * first and we are not delivering urgent data inline. + */ + if (__predict_false(toep->tp_urg_data)) { + uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked; + + if (urg_offset < avail) { + if (urg_offset) { + /* stop short of the urgent data */ + avail = urg_offset; + } else if ((so_options_get(so) & SO_OOBINLINE) == 0) { + /* First byte is urgent, skip */ + toep->tp_copied_seq++; + offset++; + avail--; + if (!avail) + goto skip_copy; + } + } + } +#endif + if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) { + user_ddp_ok = 0; +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), "t3_sosend: PSH"); +#endif + } + + if (user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(toep, rcv, uio, + IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending); + } else + DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n", + user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0, + p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted); + + /* + * If MSG_TRUNC is specified the data is discarded. 
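+	 * The copy below is skipped in that case, but the bytes still count
+	 * toward copied/copied_unacked and are still handed to
+	 * t3_cleanup_rbuf() later.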
+ * XXX need to check pr_atomic + */ + KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset)); + if (__predict_true(!(flags & MSG_TRUNC))) { + int resid = uio->uio_resid; + + sockbuf_unlock(rcv); + if ((err = copy_data(m, offset, avail, uio))) { + if (err) + err = EFAULT; + goto done_unlocked; + } + + sockbuf_lock(rcv); + if (avail != (resid - uio->uio_resid)) + printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n", + avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m)); + + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + } + + copied += avail; + copied_unacked += avail; + len -= avail; + +#ifdef URGENT_DATA_SUPPORTED +skip_copy: + if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq)) + tp->urg_data = 0; +#endif + /* + * If the buffer is fully consumed free it. If it's a DDP + * buffer also handle any events it indicates. + */ + if (avail + offset >= m->m_pkthdr.len) { + unsigned int fl = m->m_ddp_flags; + int exitnow, got_psh = 0, nomoredata = 0; + int count; + struct mbuf *nextrecord; + + if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) { + if (is_ddp_psh(m) && p->user_ddp_pending) + got_psh = 1; + + if (fl & DDP_BF_NOCOPY) + p->user_ddp_pending = 0; + else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) { + p->kbuf_posted--; + nomoredata = 1; + } else { + p->kbuf_posted--; + p->ubuf_ddp_ready = 1; + } + } + + nextrecord = m->m_nextpkt; + count = m->m_pkthdr.len; + while (count > 0) { + count -= m->m_len; + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d", m->m_len, m->m_pkthdr.len); + sbfree(rcv, m); + rcv->sb_mb = m_free(m); + m = rcv->sb_mb; + } + sockbuf_pushsync(rcv, nextrecord); +#if 0 + sbdrop_locked(rcv, m->m_pkthdr.len); +#endif + exitnow = got_psh || nomoredata; + if (copied >= target && (rcv->sb_mb == NULL) && exitnow) + goto done; + if (copied_unacked > (rcv->sb_hiwat >> 2)) { + sockbuf_unlock(rcv); + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + copied_unacked = 0; + sockbuf_lock(rcv); + } + } + if (len > 0) + goto restart; + + done: + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + /* + * If we can still receive decide what to do in preparation for the + * next receive. Note that RCV_SHUTDOWN is set if the connection + * transitioned to CLOSE but not if it was in that state to begin with. 
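+	 * Concretely, the block below cancels any pending user-buffer DDP
+	 * overlay (picking up data that arrived in the meantime), reposts the
+	 * kernel DDP buffer if none is outstanding, or enters DDP mode via
+	 * t3_enter_ddp() when so_should_ddp() decides this receive pattern
+	 * warrants it.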
+ */ + if (__predict_true((so_state_get(so) & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) { + if (p->user_ddp_pending) { + user_ddp_ok = 0; + t3_cancel_ubuf(toep, rcv); + if (rcv->sb_mb) { + if (copied < 0) + copied = 0; + if (len > 0) + goto restart; + } + p->user_ddp_pending = 0; + } + if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) { +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), + "chelsio_recvmsg: about to exit, repost kbuf"); +#endif + + t3_post_kbuf(toep, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) { + CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid); + if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev, + ddp_copy_limit), 0, IS_NONBLOCKING(so))) { + rcv->sb_flags |= SB_IN_TOE; + p->kbuf_posted = 1; + } + + } + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "chelsio_recvmsg <-: copied %d len %d buffers_freed %d " + "kbuf_posted %d user_ddp_pending %u", + copied, len, buffers_freed, p ? p->kbuf_posted : -1, + p->user_ddp_pending); +#endif + sockbuf_unlock(rcv); +done_unlocked: + if (copied_unacked && (tp->t_flags & TF_TOE)) { + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + } + sbunlock(rcv); + + return (err); +} + +static int +cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct toedev *tdev; + int rv, zcopy_thres, zcopy_enabled, flags; + struct tcpcb *tp = so_sototcpcb(so); + struct sockbuf *rcv = so_sockbuf_rcv(so); + + flags = flagsp ? *flagsp &~ MSG_EOR : 0; + + /* + * In order to use DMA direct from userspace the following + * conditions must be met: + * - the connection is currently offloaded + * - ddp is enabled + * - the number of bytes to be transferred exceeds the threshold + * - the number of bytes currently in flight won't exceed the in-flight + * threshold XXX TODO + * - vm_fault_hold_user_pages succeeds + * - blocking socket XXX for now + * - iovcnt is 1 + * + */ + if (tp && (tp->t_flags & TF_TOE) && uio && ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) + && (uio->uio_iovcnt == 1) && (mp0 == NULL) && + ((rcv->sb_flags & SB_IN_TOE) || (uio->uio_iovcnt == 1))) { + struct toepcb *toep = tp->t_toe; + + tdev = toep->tp_toedev; + zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); + zcopy_enabled = TOM_TUNABLE(tdev, ddp); + if ((rcv->sb_flags & SB_IN_TOE) ||((uio->uio_resid > zcopy_thres) && + (uio->uio_iovcnt == 1) && zcopy_enabled)) { + CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x t_flags=0x%x flags=0x%x uio_resid=%d", + rcv->sb_flags, tp->t_flags, flags, uio->uio_resid); + rv = t3_soreceive(so, flagsp, uio); + if (rv != EAGAIN) + return (rv); + else + printf("returned EAGAIN\n"); + } + } else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) { + struct sockbuf *rcv = so_sockbuf_rcv(so); + + log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n", + flags, uio->uio_iovcnt, rcv->sb_state); + } + + return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); +} + +struct protosw cxgb_protosw; +struct pr_usrreqs cxgb_tcp_usrreqs; + + +void +t3_install_socket_ops(struct socket *so) +{ + static int copied = 0; + struct pr_usrreqs *pru; + struct protosw *psw; + + if (copied == 0) { + psw = so_protosw_get(so); + pru = psw->pr_usrreqs; + + bcopy(psw, &cxgb_protosw, sizeof(*psw)); + bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru)); + + cxgb_protosw.pr_ctloutput = t3_ctloutput; + cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs; + cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend; + 
cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive; + } + so_protosw_set(so, &cxgb_protosw); + +#if 0 + so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; + so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; +#endif +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c new file mode 100644 index 0000000000000..86e1e91b98271 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c @@ -0,0 +1,738 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
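The t3_install_socket_ops() routine above interposes on the socket's protocol switch by copying the stock protosw and pr_usrreqs once, overriding pr_ctloutput, pru_sosend and pru_soreceive, and then pointing the socket at the patched copy. The stand-alone sketch below illustrates only that copy-then-override pattern; the ops_table and sock types are hypothetical stand-ins invented for illustration, not the kernel interface itself.

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for struct protosw / pr_usrreqs and a socket. */
struct ops_table {
	int (*receive)(void);
	int (*send)(void);
};

struct sock {
	const struct ops_table *ops;
};

static int stock_receive(void) { return 1; }
static int stock_send(void)    { return 2; }

/* Offloaded replacement, analogous to cxgb_soreceive(). */
static int offload_receive(void) { return 100; }

static struct ops_table offload_ops;	/* analogous to cxgb_protosw */

static void
install_offload_ops(struct sock *s)
{
	static int copied;

	if (!copied) {
		/* Copy the stock table once, then patch selected entries. */
		memcpy(&offload_ops, s->ops, sizeof(offload_ops));
		offload_ops.receive = offload_receive;
		copied = 1;
	}
	s->ops = &offload_ops;	/* analogous to so_protosw_set() */
}

int
main(void)
{
	struct ops_table stock = { stock_receive, stock_send };
	struct sock s = { &stock };

	install_offload_ops(&s);
	assert(s.ops->receive() == 100);	/* interposed entry */
	assert(s.ops->send() == 2);		/* untouched entry still works */
	printf("interposition ok\n");
	return (0);
}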
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/syslog.h> +#include <sys/uio.h> + +#include <machine/bus.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + + +#define MAX_SCHEDULE_TIMEOUT 300 + +/* + * Return the # of page pods needed to accommodate a # of pages. + */ +static inline unsigned int +pages2ppods(unsigned int pages) +{ + return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; +} + +/** + * t3_pin_pages - pin a user memory range and prepare it for DDP + * @addr - the starting address + * @len - the length of the range + * @newgl - contains the pages and physical addresses of the pinned range + * @gl - an existing gather list, may be %NULL + * + * Pins the pages in the user-space memory range [addr, addr + len) and + * maps them for DMA. Returns a gather list with the pinned pages and + * their physical addresses. If @gl is non NULL the pages it describes + * are compared against the pages for [addr, addr + len), and if the + * existing gather list already covers the range a new list is not + * allocated. Returns 0 on success, or a negative errno. On success if + * a new gather list was allocated it is returned in @newgl. 
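pages2ppods() above is a plain ceiling division: each page pod covers PPOD_PAGES pages, plus NUM_SENTINEL_PPODS trailing sentinel pods (zero in this commit, per cxgb_t3_ddp.h further down). A minimal sketch of the arithmetic, assuming the header's values of PPOD_PAGES = 4 and NUM_SENTINEL_PPODS = 0:

#include <assert.h>
#include <stdio.h>

#define PPOD_PAGES		4	/* pages per page pod (from cxgb_t3_ddp.h) */
#define NUM_SENTINEL_PPODS	0	/* trailing sentinel pods (from cxgb_t3_ddp.h) */

static unsigned int
pages2ppods(unsigned int pages)
{
	return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
}

int
main(void)
{
	assert(pages2ppods(1) == 1);	/* one page still needs a whole pod */
	assert(pages2ppods(4) == 1);	/* exactly one pod */
	assert(pages2ppods(9) == 3);	/* ceil(9 / 4) = 3 */
	printf("9 pages -> %u page pods\n", pages2ppods(9));
	return (0);
}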
+ */ +static int +t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, vm_offset_t addr, + size_t len, struct ddp_gather_list **newgl, + const struct ddp_gather_list *gl) +{ + int i = 0, err; + size_t pg_off; + unsigned int npages; + struct ddp_gather_list *p; + + /* + * XXX need x86 agnostic check + */ + if (addr + len > VM_MAXUSER_ADDRESS) + return (EFAULT); + + pg_off = addr & PAGE_MASK; + npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (p == NULL) + return (ENOMEM); + + err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE); + if (err) + goto free_gl; + + if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && + gl->dgl_length >= len) { + for (i = 0; i < npages; i++) + if (p->dgl_pages[i] != gl->dgl_pages[i]) + goto different_gl; + err = 0; + goto unpin; + } + +different_gl: + p->dgl_length = len; + p->dgl_offset = pg_off; + p->dgl_nelem = npages; +#ifdef NEED_BUSDMA + p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, + PAGE_SIZE - pg_off, + PCI_DMA_FROMDEVICE) - pg_off; + for (i = 1; i < npages; ++i) + p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); +#endif + *newgl = p; + return (0); +unpin: + vm_fault_unhold_pages(p->dgl_pages, npages); + +free_gl: + + free(p, M_DEVBUF); + *newgl = NULL; + return (err); +} + +static void +unmap_ddp_gl(const struct ddp_gather_list *gl) +{ +#ifdef NEED_BUSDMA + int i; + + if (!gl->nelem) + return; + + pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset, + PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE); + for (i = 1; i < gl->nelem; ++i) + pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE, + PCI_DMA_FROMDEVICE); + +#endif +} + +static void +ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty) +{ + /* + * XXX mark pages as dirty before unholding + */ + vm_fault_unhold_pages(gl->dgl_pages, gl->dgl_nelem); +} + +void +t3_free_ddp_gl(struct ddp_gather_list *gl) +{ + unmap_ddp_gl(gl); + ddp_gl_free_pages(gl, 0); + free(gl, M_DEVBUF); +} + +/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ +#define MAX_PPODS 64U + +/* + * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in + * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we + * try to allocate enough page pods to accommodate the whole buffer, subject to + * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page + * pods before failing entirely. + */ +static int +alloc_buf1_ppods(struct toepcb *toep, struct ddp_state *p, + unsigned long addr, unsigned int len) +{ + int err, tag, npages, nppods; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + +#if 0 + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = min(pages2ppods(npages), MAX_PPODS); + nppods = roundup2(nppods, PPOD_CLUSTER_SIZE); + err = t3_alloc_ppods(d, nppods, &tag); + if (err && nppods > PPOD_CLUSTER_SIZE) { + nppods = PPOD_CLUSTER_SIZE; + err = t3_alloc_ppods(d, nppods, &tag); + } + if (err) + return (ENOMEM); + + p->ubuf_nppods = nppods; + p->ubuf_tag = tag; +#if NUM_DDP_KBUF == 1 + t3_set_ddp_tag(toep, 1, tag << 6); +#endif + return (0); +} + +/* + * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush + * won't block indefinitely if there's nothing to place (which should be rare). 
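alloc_buf1_ppods() sizes the user-buffer pod request from the page span of [addr, addr + len), caps it at MAX_PPODS (64, a 1MB buffer at 4KB pages) and rounds up to PPOD_CLUSTER_SIZE, falling back to a single cluster only when the larger allocation fails. The sketch below reproduces just that sizing arithmetic for a machine with assumed 4KB pages; the real allocation against the pod map is elided.

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE		4096UL		/* assumed 4KB pages */
#define PAGE_MASK		(PAGE_SIZE - 1)
#define PPOD_PAGES		4
#define NUM_SENTINEL_PPODS	0
#define MAX_PPODS		64U		/* 1MB buffer at 4KB pages */
#define PPOD_CLUSTER_SIZE	16

#define roundup2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))	/* y must be a power of 2 */
#define min(a, b)	((a) < (b) ? (a) : (b))

static unsigned int
pages2ppods(unsigned int pages)
{
	return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
}

static unsigned int
buf1_nppods(unsigned long addr, unsigned int len)
{
	unsigned int npages, nppods;

	npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) / PAGE_SIZE;
	nppods = min(pages2ppods(npages), MAX_PPODS);
	return roundup2(nppods, PPOD_CLUSTER_SIZE);
}

int
main(void)
{
	/* 256KB buffer starting 1KB into a page: 65 pages -> 17 pods -> 32. */
	assert(buf1_nppods(0x1400, 256 * 1024) == 32);
	/* A huge buffer is capped at MAX_PPODS. */
	assert(buf1_nppods(0, 8 * 1024 * 1024) == MAX_PPODS);
	printf("sizing ok\n");
	return (0);
}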
+ */ +#define UBUF_OFFSET 1 + +static __inline unsigned long +select_ddp_flags(const struct toepcb *toep, int buf_idx, + int nonblock, int rcv_flags) +{ + if (buf_idx == 1) { + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_1(1); + if (nonblock) + return V_TF_DDP_BUF1_FLUSH(1); + + return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(toep->tp_toedev, + ddp_push_wait)); + } + + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_0(1); + if (nonblock) + return V_TF_DDP_BUF0_FLUSH(1); + + return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(toep->tp_toedev, ddp_push_wait)); +} + +/* + * Reposts the kernel DDP buffer after it has been previously become full and + * invalidated. We just need to reset the offset and adjust the DDP flags. + * Conveniently, we can set the flags and the offset with a single message. + * Note that this function does not set the buffer length. Again conveniently + * our kernel buffer is of fixed size. If the length needs to be changed it + * needs to be done separately. + */ +static void +t3_repost_kbuf(struct toepcb *toep, unsigned int bufidx, int modulate, + int activate, int nonblock) +{ + struct ddp_state *p = &toep->tp_ddp_state; + unsigned long flags; + +#if 0 + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset; + p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0; + p->buf_state[bufidx].gl = p->kbuf[bufidx]; + p->cur_buf = bufidx; + p->kbuf_idx = bufidx; + + flags = select_ddp_flags(toep, bufidx, nonblock, 0); + if (!bufidx) + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF0_VALID(1), + V_TF_DDP_BUF0_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF0_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), modulate); + else + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF1_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), + V_TF_DDP_BUF1_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + modulate); + +} + +/** + * setup_uio_ppods - setup HW page pods for a user iovec + * @sk: the associated socket + * @uio: the uio + * @oft: additional bytes to map before the start of the buffer + * + * Pins a user iovec and sets up HW page pods for DDP into it. We allocate + * page pods for user buffers on the first call per socket. Afterwards we + * limit the buffer length to whatever the existing page pods can accommodate. + * Returns a negative error code or the length of the mapped buffer. + * + * The current implementation handles iovecs with only one entry. 
+ */ +static int +setup_uio_ppods(struct toepcb *toep, const struct uio *uio, int oft, int *length) +{ + int err; + unsigned int len; + struct ddp_gather_list *gl = NULL; + struct ddp_state *p = &toep->tp_ddp_state; + struct iovec *iov = uio->uio_iov; + vm_offset_t addr = (vm_offset_t)iov->iov_base - oft; + +#ifdef notyet + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + if (__predict_false(p->ubuf_nppods == 0)) { + err = alloc_buf1_ppods(toep, p, addr, iov->iov_len + oft); + if (err) + return (err); + } + + len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; + len -= addr & PAGE_MASK; + if (len > M_TCB_RX_DDP_BUF0_LEN) + len = M_TCB_RX_DDP_BUF0_LEN; + len = min(len, toep->tp_tp->rcv_wnd - 32768); + len = min(len, iov->iov_len + oft); + + if (len <= p->kbuf[0]->dgl_length) { + printf("length too short\n"); + return (EINVAL); + } + + err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); + if (err) + return (err); + if (gl) { + if (p->ubuf) + t3_free_ddp_gl(p->ubuf); + p->ubuf = gl; + t3_setup_ppods(toep, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, + gl->dgl_offset, 0); + } + *length = len; + return (0); +} + +/* + * + */ +void +t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int ubuf_pending = t3_ddp_ubuf_pending(toep); + int err = 0, count = 0; + + if (p->ubuf == NULL) + return; + + sockbuf_lock_assert(rcv); + + p->cancel_ubuf = 1; + while (ubuf_pending && !(rcv->sb_state & SBS_CANTRCVMORE)) { + CTR3(KTR_TOM, + "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d", + p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->get_tcb_count); + if (p->get_tcb_count == 0) + t3_cancel_ddpbuf(toep, p->cur_buf); + else + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p SBS_CANTRCVMORE=%d", + err, p->get_tcb_count, rcv->sb_timeo, rcv, + !!(rcv->sb_state & SBS_CANTRCVMORE)); + + while (p->get_tcb_count && !(rcv->sb_state & SBS_CANTRCVMORE)) { + if (count & 0xfffffff) + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p count=%d", + err, p->get_tcb_count, rcv->sb_timeo, rcv, count); + count++; + err = sbwait(rcv); + } + ubuf_pending = t3_ddp_ubuf_pending(toep); + } + p->cancel_ubuf = 0; + p->user_ddp_pending = 0; + +} + +#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE0(1) | \ + V_TF_DDP_PSH_NO_INVALIDATE1(1) | \ + V_TF_DDP_BUF1_FLUSH(1) | \ + V_TF_DDP_BUF0_FLUSH(1) | \ + V_TF_DDP_PUSH_DISABLE_1(1) | \ + V_TF_DDP_PUSH_DISABLE_0(1) | \ + V_TF_DDP_INDICATE_OUT(1)) + +/* + * Post a user buffer as an overlay on top of the current kernel buffer. + */ +int +t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv, + const struct uio *uio, int nonblock, int rcv_flags, + int modulate, int post_kbuf) +{ + int err, len, ubuf_idx; + unsigned long flags; + struct ddp_state *p = &toep->tp_ddp_state; + + if (p->kbuf[0] == NULL) { + return (EINVAL); + } + sockbuf_unlock(rcv); + err = setup_uio_ppods(toep, uio, 0, &len); + sockbuf_lock(rcv); + if (err) + return (err); + + if ((rcv->sb_state & SBS_CANTRCVMORE) || + (toep->tp_tp->t_flags & TF_TOE) == 0) + return (EINVAL); + + ubuf_idx = p->kbuf_idx; + p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; + /* Use existing offset */ + /* Don't need to update .gl, user buffer isn't copied. 
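setup_uio_ppods() above derives the DDP-able length from the pods already allocated, then clamps it in sequence: subtract the start address's page offset, cap at the hardware field limit M_TCB_RX_DDP_BUF0_LEN, cap at the receive window minus 32KB, and finally cap at the iovec length; a result no longer than the kernel buffer is rejected. A small sketch of that clamping chain with illustrative values; the hardware limit and window sizes here are assumptions, not the real register contents.

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096u		/* assumed */
#define PAGE_MASK	(PAGE_SIZE - 1)
#define PPOD_PAGES	4u
#define NUM_SENTINEL_PPODS 0u
#define HW_BUF0_LEN_MAX	(256u * 1024)	/* stand-in for M_TCB_RX_DDP_BUF0_LEN */

static unsigned int umin(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* Returns the length that would be posted for DDP, or 0 if it would not
 * beat the kernel-buffer size (the "length too short" rejection case). */
static unsigned int
clamp_ubuf_len(unsigned int nppods, unsigned long addr, unsigned int iov_len,
    unsigned int rcv_wnd, unsigned int kbuf_len)
{
	unsigned int len;

	len = (nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE;
	len -= addr & PAGE_MASK;
	len = umin(len, HW_BUF0_LEN_MAX);
	len = umin(len, rcv_wnd - 32768);
	len = umin(len, iov_len);
	return (len <= kbuf_len) ? 0 : len;
}

int
main(void)
{
	/* 32 pods cover 512KB; a 200KB iovec in a 256KB window wins. */
	unsigned int len = clamp_ubuf_len(32, 0x2000, 200 * 1024,
	    256 * 1024, 64 * 1024);
	assert(len == 200 * 1024);
	/* A 16KB iovec does not beat a 64KB kernel buffer, so it is rejected. */
	assert(clamp_ubuf_len(32, 0, 16 * 1024, 256 * 1024, 64 * 1024) == 0);
	printf("clamped len = %u\n", len);
	return (0);
}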
*/ + p->cur_buf = ubuf_idx; + + flags = select_ddp_flags(toep, ubuf_idx, nonblock, rcv_flags); + + if (post_kbuf) { + struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; + + dbs->cur_offset = 0; + dbs->flags = 0; + dbs->gl = p->kbuf[ubuf_idx ^ 1]; + p->kbuf_idx ^= 1; + flags |= p->kbuf_idx ? + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); + } + + if (ubuf_idx == 0) { + t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, + len); + t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0, + flags, + OVERLAY_MASK | flags, 1); + } else { + t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, + len); + t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0, + flags, + OVERLAY_MASK | flags, 1); + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " + " kbuf_idx %d", + p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); +#endif + CTR3(KTR_TOM, + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x", + p->ubuf_tag, flags, OVERLAY_MASK); + CTR3(KTR_TOM, + "t3_overlay_ubuf: ubuf_idx %d kbuf_idx %d post_kbuf %d", + ubuf_idx, p->kbuf_idx, post_kbuf); + + return (0); +} + +/* + * Clean up DDP state that needs to survive until socket close time, such as the + * DDP buffers. The buffers are already unmapped at this point as unmapping + * needs the PCI device and a socket may close long after the device is removed. + */ +void +t3_cleanup_ddp(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) + if (p->kbuf[idx]) { + ddp_gl_free_pages(p->kbuf[idx], 0); + free(p->kbuf[idx], M_DEVBUF); + } + if (p->ubuf) { + ddp_gl_free_pages(p->ubuf, 0); + free(p->ubuf, M_DEVBUF); + p->ubuf = NULL; + } + toep->tp_ulp_mode = 0; +} + +/* + * This is a companion to t3_cleanup_ddp() and releases the HW resources + * associated with a connection's DDP state, such as the page pods. + * It's called when HW is done with a connection. The rest of the state + * remains available until both HW and the app are done with the connection. + */ +void +t3_release_ddp_resources(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + t3_free_ppods(d, p->kbuf_tag[idx], + p->kbuf_nppods[idx]); + unmap_ddp_gl(p->kbuf[idx]); + } + + if (p->ubuf_nppods) { + t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods); + p->ubuf_nppods = 0; + } + if (p->ubuf) + unmap_ddp_gl(p->ubuf); + +} + +void +t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock) +{ + struct ddp_state *p = &toep->tp_ddp_state; + + t3_set_ddp_tag(toep, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6); + t3_set_ddp_buf(toep, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length); + t3_repost_kbuf(toep, p->cur_buf, modulate, 1, nonblock); +#ifdef T3_TRACE + T3_TRACE1(TIDTB(so), + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +#endif + CTR1(KTR_TOM, + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +} + +/* + * Prepare a socket for DDP. Must be called when the socket is known to be + * open. 
+ */ +int +t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock) +{ + int i, err = ENOMEM; + static vm_pindex_t color; + unsigned int nppods, kbuf_pages, idx = 0; + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + + + if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN) + return (EINVAL); + +#ifdef notyet + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = pages2ppods(kbuf_pages); + + p->kbuf_noinval = !!waitall; + p->kbuf_tag[NUM_DDP_KBUF - 1] = -1; + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + p->kbuf[idx] = + malloc(sizeof (struct ddp_gather_list) + kbuf_pages * + sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO); + if (p->kbuf[idx] == NULL) + goto err; + err = t3_alloc_ppods(d, nppods, &p->kbuf_tag[idx]); + if (err) { + printf("t3_alloc_ppods failed err=%d\n", err); + goto err; + } + + p->kbuf_nppods[idx] = nppods; + p->kbuf[idx]->dgl_length = kbuf_size; + p->kbuf[idx]->dgl_offset = 0; + p->kbuf[idx]->dgl_nelem = kbuf_pages; + + for (i = 0; i < kbuf_pages; ++i) { + p->kbuf[idx]->dgl_pages[i] = vm_page_alloc(NULL, color, + VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (p->kbuf[idx]->dgl_pages[i] == NULL) { + p->kbuf[idx]->dgl_nelem = i; + printf("failed to allocate kbuf pages\n"); + goto err; + } + } +#ifdef NEED_BUSDMA + /* + * XXX we'll need this for VT-d or any platform with an iommu :-/ + * + */ + for (i = 0; i < kbuf_pages; ++i) + p->kbuf[idx]->phys_addr[i] = + pci_map_page(p->pdev, p->kbuf[idx]->pages[i], + 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); +#endif + t3_setup_ppods(toep, p->kbuf[idx], nppods, p->kbuf_tag[idx], + p->kbuf[idx]->dgl_length, 0, 0); + } + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + + t3_set_ddp_tag(toep, 0, p->kbuf_tag[0] << 6); + t3_set_ddp_buf(toep, 0, 0, p->kbuf[0]->dgl_length); + t3_repost_kbuf(toep, 0, 0, 1, nonblock); + + t3_set_rcv_coalesce_enable(toep, + TOM_TUNABLE(toep->tp_toedev, ddp_rcvcoalesce)); + t3_set_dack_mss(toep, TOM_TUNABLE(toep->tp_toedev, delack)>>1); + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(so), + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); +#endif + CTR4(KTR_TOM, + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + return (0); + +err: + t3_release_ddp_resources(toep); + t3_cleanup_ddp(toep); + return (err); +} + +int +t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len) +{ + int resid_init, err; + struct ddp_gather_list *gl = (struct ddp_gather_list *)m->m_ddp_gl; + + resid_init = uio->uio_resid; + + if (!gl->dgl_pages) + panic("pages not set\n"); + + CTR4(KTR_TOM, "t3_ddp_copy: offset=%d dgl_offset=%d cur_offset=%d len=%d", + offset, gl->dgl_offset, m->m_cur_offset, len); + offset += gl->dgl_offset + m->m_cur_offset; + KASSERT(len <= gl->dgl_length, + ("len=%d > dgl_length=%d in ddp_copy\n", len, gl->dgl_length)); + + + err = uiomove_fromphys(gl->dgl_pages, offset, len, uio); + return (err); +} + + +/* + * Allocate n page pods. Returns -1 on failure or the page pod tag. 
+ */ +int +t3_alloc_ppods(struct tom_data *td, unsigned int n, int *ptag) +{ + unsigned int i, j; + + if (__predict_false(!td->ppod_map)) { + printf("ppod_map not set\n"); + return (EINVAL); + } + + mtx_lock(&td->ppod_map_lock); + for (i = 0; i < td->nppods; ) { + + for (j = 0; j < n; ++j) /* scan ppod_map[i..i+n-1] */ + if (td->ppod_map[i + j]) { + i = i + j + 1; + goto next; + } + memset(&td->ppod_map[i], 1, n); /* allocate range */ + mtx_unlock(&td->ppod_map_lock); + CTR2(KTR_TOM, + "t3_alloc_ppods: n=%u tag=%u", n, i); + *ptag = i; + return (0); + next: ; + } + mtx_unlock(&td->ppod_map_lock); + return (0); +} + +void +t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n) +{ + /* No need to take ppod_lock here */ + memset(&td->ppod_map[tag], 0, n); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h new file mode 100644 index 0000000000000..8c14f5ae89c87 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h @@ -0,0 +1,90 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
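t3_alloc_ppods() above scans the byte-per-pod ppod_map for a run of n free slots and marks the run used with memset(); t3_free_ppods() simply clears it again. Below is a lock-free userspace sketch of the same first-fit scan so the indexing is easy to follow; unlike the driver it reports failure with -1 when no run exists, purely for illustration.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define NPPODS	32			/* tiny map for the example */
static unsigned char ppod_map[NPPODS];	/* 0 = free, 1 = allocated */

/* First-fit scan for n consecutive free pods; returns the tag (start index)
 * or -1 when no such run exists. */
static int
alloc_ppods(unsigned int n)
{
	unsigned int i, j;

	for (i = 0; i + n <= NPPODS; ) {
		for (j = 0; j < n; ++j)
			if (ppod_map[i + j]) {
				i = i + j + 1;	/* skip past the busy slot */
				goto next;
			}
		memset(&ppod_map[i], 1, n);	/* claim the run */
		return (int)i;
next:		;
	}
	return -1;
}

static void
free_ppods(unsigned int tag, unsigned int n)
{
	memset(&ppod_map[tag], 0, n);
}

int
main(void)
{
	int a = alloc_ppods(16), b = alloc_ppods(16);

	assert(a == 0 && b == 16);
	assert(alloc_ppods(1) == -1);	/* map exhausted */
	free_ppods(a, 16);
	assert(alloc_ppods(8) == 0);	/* reuses the freed run */
	printf("pod map ok\n");
	return (0);
}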
+ + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_DEFS_H_ +#define CXGB_DEFS_H_ + +#define VALIDATE_TID 0 + +#define TOEPCB(so) ((struct toepcb *)(sototcpcb((so))->t_toe)) +#define TOE_DEV(so) (TOEPCB((so))->tp_toedev) +#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket) +#define sototoep(so) (sototcpcb((so))->t_toe) + +#define TRACE_ENTER printf("%s:%s entered\n", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited\n", __FUNCTION__, __FILE__, __LINE__) + +#define KTR_TOM KTR_SPARE2 +#define KTR_TCB KTR_SPARE3 + +struct toepcb; +struct listen_ctx; + +typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m); + +void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h); +void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev); +void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev); +int t3_push_frames(struct socket *so, int req_completion); +int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt, + struct sockaddr *nam); +void t3_init_listen_cpl_handlers(void); +int t3_init_cpl_io(void); +void t3_init_wr_tab(unsigned int wr_len); +uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail); +void t3_send_rx_modulate(struct toepcb *toep); +void t3_cleanup_rbuf(struct tcpcb *tp, int copied); + +void t3_init_socket_ops(void); +void t3_install_socket_ops(struct socket *so); + + +void t3_disconnect_acceptq(struct socket *listen_so); +void t3_reset_synq(struct listen_ctx *ctx); +void t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler); + +struct toepcb *toepcb_alloc(void); +void toepcb_hold(struct toepcb *); +void toepcb_release(struct toepcb *); +void toepcb_init(struct toepcb *); + +void t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off); +void t3_set_dack_mss(struct toepcb *toep, int on); +void t3_set_keepalive(struct toepcb *toep, int on_off); +void t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag); +void t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, + unsigned int len); +int t3_get_tcb(struct toepcb *toep); + +int t3_ctloutput(struct socket *so, struct sockopt *sopt); + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_l2t.c b/sys/dev/cxgb/ulp/tom/cxgb_l2t.c new file mode 100644 index 0000000000000..ab5fbe740114b --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.c @@ -0,0 +1,542 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#if __FreeBSD_version > 700000 +#include <sys/rwlock.h> +#endif + +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/if.h> +#include <net/ethernet.h> +#include <net/if_vlan_var.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/if_ether.h> + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + +#define VLAN_NONE 0xfff +#define SDL(s) ((struct sockaddr_dl *)s) +#define RT_ENADDR(sa) ((u_char *)LLADDR(SDL((sa)))) +#define rt_expire rt_rmx.rmx_expire + +struct llinfo_arp { + struct callout la_timer; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + u_short la_preempt; /* countdown for pre-expiry arps */ + u_short la_asked; /* # requests sent */ +}; + +/* + * Module locking notes: There is a RW lock protecting the L2 table as a + * whole plus a spinlock per L2T entry. Entry lookups and allocations happen + * under the protection of the table lock, individual entry changes happen + * while holding that entry's spinlock. The table lock nests outside the + * entry locks. Allocations of new entries take the table lock as writers so + * no other lookups can happen while allocating new entries. Entry updates + * take the table lock as readers so multiple entries can be updated in + * parallel. An L2T entry can be dropped by decrementing its reference count + * and therefore can happen in parallel with entry allocation but no entry + * can change state or increment its ref count during allocation as both of + * these perform lookups. + */ + +static inline unsigned int +vlan_prio(const struct l2t_entry *e) +{ + return e->vlan >> 13; +} + +static inline unsigned int +arp_hash(u32 key, int ifindex, const struct l2t_data *d) +{ + return jhash_2words(key, ifindex, 0) & (d->nentries - 1); +} + +static inline void +neigh_replace(struct l2t_entry *e, struct rtentry *rt) +{ + RT_LOCK(rt); + RT_ADDREF(rt); + RT_UNLOCK(rt); + + if (e->neigh) + RTFREE(e->neigh); + e->neigh = rt; +} + +/* + * Set up an L2T entry and send any packets waiting in the arp queue. The + * supplied mbuf is used for the CPL_L2T_WRITE_REQ. Must be called with the + * entry locked. 
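arp_hash() above mixes the next-hop IP address and the interface index with jhash_2words() and masks the result with nentries - 1, so the table size chosen in t3_init_l2t() must be a power of two for the mask to act as a modulus. A hedged sketch of the bucketing, using a simple stand-in mixer (mix32) invented for illustration rather than the real Jenkins hash:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative 32-bit mixer; NOT jhash_2words(), just something well spread. */
static uint32_t
mix32(uint32_t a, uint32_t b)
{
	uint32_t h = a * 0x9e3779b1u ^ b * 0x85ebca6bu;

	h ^= h >> 16;
	h *= 0x7feb352du;
	h ^= h >> 15;
	return h;
}

static unsigned int
arp_hash(uint32_t addr, int ifindex, unsigned int nentries)
{
	/* nentries must be a power of two for the mask to behave as a modulus. */
	assert((nentries & (nentries - 1)) == 0);
	return mix32(addr, (uint32_t)ifindex) & (nentries - 1);
}

int
main(void)
{
	unsigned int b1 = arp_hash(0x0a000001, 2, 256);	/* 10.0.0.1 on ifindex 2 */
	unsigned int b2 = arp_hash(0x0a000001, 3, 256);	/* same IP, other ifp */

	assert(b1 < 256 && b2 < 256);
	printf("buckets: %u %u\n", b1, b2);
	return (0);
}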
+ */ +static int +setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m, + struct l2t_entry *e) +{ + struct cpl_l2t_write_req *req; + + if (!m) { + if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + return (ENOMEM); + } + /* + * XXX MH_ALIGN + */ + req = mtod(m, struct cpl_l2t_write_req *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx)); + req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) | + V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) | + V_L2T_W_PRIO(vlan_prio(e))); + + memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); + m_set_priority(m, CPL_PRIORITY_CONTROL); + cxgb_ofld_send(dev, m); + while (e->arpq_head) { + m = e->arpq_head; + e->arpq_head = m->m_next; + m->m_next = NULL; + cxgb_ofld_send(dev, m); + } + e->arpq_tail = NULL; + e->state = L2T_STATE_VALID; + + return 0; +} + +/* + * Add a packet to the an L2T entry's queue of packets awaiting resolution. + * Must be called with the entry's lock held. + */ +static inline void +arpq_enqueue(struct l2t_entry *e, struct mbuf *m) +{ + m->m_next = NULL; + if (e->arpq_head) + e->arpq_tail->m_next = m; + else + e->arpq_head = m; + e->arpq_tail = m; +} + +int +t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e) +{ + struct rtentry *rt = e->neigh; + struct sockaddr_in sin; + + bzero(&sin, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr.s_addr = e->addr; + + CTR2(KTR_CXGB, "send slow on rt=%p eaddr=0x%08x\n", rt, e->addr); +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac); + mtx_lock(&e->lock); + if (e->state == L2T_STATE_STALE) + e->state = L2T_STATE_VALID; + mtx_unlock(&e->lock); + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return cxgb_ofld_send(dev, m); + case L2T_STATE_RESOLVING: + mtx_lock(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { // ARP already completed + mtx_unlock(&e->lock); + goto again; + } + arpq_enqueue(e, m); + mtx_unlock(&e->lock); + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the m_gethdr below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. 
+ */ + if (arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac) == 0) { + CTR6(KTR_CXGB, "mac=%x:%x:%x:%x:%x:%x\n", + e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); + + if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + return (ENOMEM); + + mtx_lock(&e->lock); + if (e->arpq_head) + setup_l2e_send_pending(dev, m, e); + else + m_freem(m); + mtx_unlock(&e->lock); + } + } + return 0; +} + +void +t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) +{ + struct rtentry *rt; + struct mbuf *m0; + struct sockaddr_in sin; + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr.s_addr = e->addr; + + if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + return; + + rt = e->neigh; +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac); + mtx_lock(&e->lock); + if (e->state == L2T_STATE_STALE) { + e->state = L2T_STATE_VALID; + } + mtx_unlock(&e->lock); + return; + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return; + case L2T_STATE_RESOLVING: + mtx_lock(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { // ARP already completed + mtx_unlock(&e->lock); + goto again; + } + mtx_unlock(&e->lock); + + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the alloc_skb below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. + */ + arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac); + + } + return; +} +/* + * Allocate a free L2T entry. Must be called with l2t_data.lock held. + */ +static struct l2t_entry * +alloc_l2e(struct l2t_data *d) +{ + struct l2t_entry *end, *e, **p; + + if (!atomic_load_acq_int(&d->nfree)) + return NULL; + + /* there's definitely a free entry */ + for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) + if (atomic_load_acq_int(&e->refcnt) == 0) + goto found; + + for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e) ; +found: + d->rover = e + 1; + atomic_add_int(&d->nfree, -1); + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + if (e->state != L2T_STATE_UNUSED) { + int hash = arp_hash(e->addr, e->ifindex, d); + + for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) + if (*p == e) { + *p = e->next; + break; + } + e->state = L2T_STATE_UNUSED; + } + + return e; +} + +/* + * Called when an L2T entry has no more users. The entry is left in the hash + * table since it is likely to be reused but we also bump nfree to indicate + * that the entry can be reallocated for a different neighbor. We also drop + * the existing neighbor reference in case the neighbor is going away and is + * waiting on our reference. + * + * Because entries can be reallocated to other neighbors once their ref count + * drops to 0 we need to take the entry's lock to avoid races with a new + * incarnation. 
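alloc_l2e() above may recycle an entry that still sits on a hash chain, and unlinks it with the pointer-to-pointer walk (p = &bucket.first; advance p to &(*p)->next), which removes a node from a singly linked list without tracking a separate predecessor. A self-contained sketch of just that idiom:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	int idx;
	struct node *next;
};

/* Unlink e from the chain rooted at *head using a pointer-to-pointer walk,
 * the same shape alloc_l2e() uses to pull a recycled entry out of its old
 * hash bucket. */
static void
chain_unlink(struct node **head, struct node *e)
{
	struct node **p;

	for (p = head; *p; p = &(*p)->next)
		if (*p == e) {
			*p = e->next;
			break;
		}
}

int
main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a;

	chain_unlink(&head, &b);		/* remove the middle node */
	assert(head == &a && a.next == &c);
	chain_unlink(&head, &a);		/* remove the head itself */
	assert(head == &c);
	printf("unlink ok\n");
	return (0);
}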
+ */ +void +t3_l2e_free(struct l2t_data *d, struct l2t_entry *e) +{ + struct rtentry *rt = NULL; + + mtx_lock(&e->lock); + if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */ + rt = e->neigh; + e->neigh = NULL; + } + + mtx_unlock(&e->lock); + atomic_add_int(&d->nfree, 1); + if (rt) + RTFREE(rt); +} + + +/* + * Update an L2T entry that was previously used for the same next hop as neigh. + * Must be called with softirqs disabled. + */ +static inline void +reuse_entry(struct l2t_entry *e, struct rtentry *neigh) +{ + struct llinfo_arp *la; + + la = (struct llinfo_arp *)neigh->rt_llinfo; + + mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ + if (neigh != e->neigh) + neigh_replace(e, neigh); + + if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), sizeof(e->dmac)) || + (neigh->rt_expire > time_uptime)) + e->state = L2T_STATE_RESOLVING; + else if (la->la_hold == NULL) + e->state = L2T_STATE_VALID; + else + e->state = L2T_STATE_STALE; + mtx_unlock(&e->lock); +} + +struct l2t_entry * +t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh, struct ifnet *ifp, + struct sockaddr *sa) +{ + struct l2t_entry *e; + struct l2t_data *d = L2DATA(dev); + u32 addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr; + int ifidx = neigh->rt_ifp->if_index; + int hash = arp_hash(addr, ifidx, d); + unsigned int smt_idx = ((struct port_info *)ifp->if_softc)->port_id; + + rw_wlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx && + e->smt_idx == smt_idx) { + l2t_hold(d, e); + if (atomic_load_acq_int(&e->refcnt) == 1) + reuse_entry(e, neigh); + goto done; + } + + /* Need to allocate a new entry */ + e = alloc_l2e(d); + if (e) { + mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + rw_wunlock(&d->lock); + + e->state = L2T_STATE_RESOLVING; + e->addr = addr; + e->ifindex = ifidx; + e->smt_idx = smt_idx; + atomic_store_rel_int(&e->refcnt, 1); + e->neigh = NULL; + + + neigh_replace(e, neigh); +#ifdef notyet + /* + * XXX need to add accessor function for vlan tag + */ + if (neigh->rt_ifp->if_vlantrunk) + e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id; + else +#endif + e->vlan = VLAN_NONE; + mtx_unlock(&e->lock); + + return (e); + } + +done: + rw_wunlock(&d->lock); + return e; +} + +/* + * Called when address resolution fails for an L2T entry to handle packets + * on the arpq head. If a packet specifies a failure handler it is invoked, + * otherwise the packets is sent to the TOE. + * + * XXX: maybe we should abandon the latter behavior and just require a failure + * handler. 
+ */ +static void +handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq) +{ + + while (arpq) { + struct mbuf *m = arpq; +#ifdef notyet + struct l2t_mbuf_cb *cb = L2T_MBUF_CB(m); +#endif + arpq = m->m_next; + m->m_next = NULL; +#ifdef notyet + if (cb->arp_failure_handler) + cb->arp_failure_handler(dev, m); + else +#endif + cxgb_ofld_send(dev, m); + } + +} + +void +t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, + uint8_t *enaddr, struct sockaddr *sa) +{ + struct l2t_entry *e; + struct mbuf *arpq = NULL; + struct l2t_data *d = L2DATA(dev); + u32 addr = *(u32 *) &((struct sockaddr_in *)sa)->sin_addr; + int ifidx = neigh->rt_ifp->if_index; + int hash = arp_hash(addr, ifidx, d); + struct llinfo_arp *la; + + rw_rlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx) { + mtx_lock(&e->lock); + goto found; + } + rw_runlock(&d->lock); + CTR1(KTR_CXGB, "t3_l2t_update: addr=0x%08x not found", addr); + return; + +found: + printf("found 0x%08x\n", addr); + + rw_runlock(&d->lock); + memcpy(e->dmac, enaddr, ETHER_ADDR_LEN); + printf("mac=%x:%x:%x:%x:%x:%x\n", + e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); + + if (atomic_load_acq_int(&e->refcnt)) { + if (neigh != e->neigh) + neigh_replace(e, neigh); + + la = (struct llinfo_arp *)neigh->rt_llinfo; + if (e->state == L2T_STATE_RESOLVING) { + + if (la->la_asked >= 5 /* arp_maxtries */) { + arpq = e->arpq_head; + e->arpq_head = e->arpq_tail = NULL; + } else + setup_l2e_send_pending(dev, NULL, e); + } else { + e->state = L2T_STATE_VALID; + if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), 6)) + setup_l2e_send_pending(dev, NULL, e); + } + } + mtx_unlock(&e->lock); + + if (arpq) + handle_failed_resolution(dev, arpq); +} + +struct l2t_data * +t3_init_l2t(unsigned int l2t_capacity) +{ + struct l2t_data *d; + int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry); + + d = cxgb_alloc_mem(size); + if (!d) + return NULL; + + d->nentries = l2t_capacity; + d->rover = &d->l2tab[1]; /* entry 0 is not used */ + atomic_store_rel_int(&d->nfree, l2t_capacity - 1); + rw_init(&d->lock, "L2T"); + + for (i = 0; i < l2t_capacity; ++i) { + d->l2tab[i].idx = i; + d->l2tab[i].state = L2T_STATE_UNUSED; + mtx_init(&d->l2tab[i].lock, "L2TAB", NULL, MTX_DEF); + atomic_store_rel_int(&d->l2tab[i].refcnt, 0); + } + return d; +} + +void +t3_free_l2t(struct l2t_data *d) +{ + int i; + + rw_destroy(&d->lock); + for (i = 0; i < d->nentries; ++i) + mtx_destroy(&d->l2tab[i].lock); + + cxgb_free_mem(d); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_l2t.h b/sys/dev/cxgb/ulp/tom/cxgb_l2t.h new file mode 100644 index 0000000000000..3575f6fa98b14 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.h @@ -0,0 +1,161 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef _CHELSIO_L2T_H +#define _CHELSIO_L2T_H + +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <sys/lock.h> + +#if __FreeBSD_version > 700000 +#include <sys/rwlock.h> +#else +#define rwlock mtx +#define rw_wlock(x) mtx_lock((x)) +#define rw_wunlock(x) mtx_unlock((x)) +#define rw_rlock(x) mtx_lock((x)) +#define rw_runlock(x) mtx_unlock((x)) +#define rw_init(x, str) mtx_init((x), (str), NULL, MTX_DEF) +#define rw_destroy(x) mtx_destroy((x)) +#endif + +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_UNUSED /* entry not in use */ +}; + +/* + * Each L2T entry plays multiple roles. First of all, it keeps state for the + * corresponding entry of the HW L2 table and maintains a queue of offload + * packets awaiting address resolution. Second, it is a node of a hash table + * chain, where the nodes of the chain are linked together through their next + * pointer. Finally, each node is a bucket of a hash table, pointing to the + * first element in its chain through its first pointer. + */ +struct l2t_entry { + uint16_t state; /* entry state */ + uint16_t idx; /* entry index */ + uint32_t addr; /* dest IP address */ + int ifindex; /* neighbor's net_device's ifindex */ + uint16_t smt_idx; /* SMT index */ + uint16_t vlan; /* VLAN TCI (id: bits 0-11, prio: 13-15 */ + struct rtentry *neigh; /* associated neighbour */ + struct l2t_entry *first; /* start of hash chain */ + struct l2t_entry *next; /* next l2t_entry on chain */ + struct mbuf *arpq_head; /* queue of packets awaiting resolution */ + struct mbuf *arpq_tail; + struct mtx lock; + volatile uint32_t refcnt; /* entry reference count */ + uint8_t dmac[6]; /* neighbour's MAC address */ +}; + +struct l2t_data { + unsigned int nentries; /* number of entries */ + struct l2t_entry *rover; /* starting point for next allocation */ + volatile uint32_t nfree; /* number of free entries */ + struct rwlock lock; + struct l2t_entry l2tab[0]; +}; + +typedef void (*arp_failure_handler_func)(struct t3cdev *dev, + struct mbuf *m); + +typedef void (*opaque_arp_failure_handler_func)(void *dev, + struct mbuf *m); + +/* + * Callback stored in an skb to handle address resolution failure. + */ +struct l2t_mbuf_cb { + arp_failure_handler_func arp_failure_handler; +}; + +/* + * XXX + */ +#define L2T_MBUF_CB(skb) ((struct l2t_mbuf_cb *)(skb)->cb) + + +static __inline void set_arp_failure_handler(struct mbuf *m, + arp_failure_handler_func hnd) +{ + m->m_pkthdr.header = (opaque_arp_failure_handler_func)hnd; + +} + +/* + * Getting to the L2 data from an offload device. 
+ */ +#define L2DATA(dev) ((dev)->l2opt) + +void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e); +void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa); +struct l2t_entry *t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh, + struct ifnet *ifp, struct sockaddr *sa); +int t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, + struct l2t_entry *e); +void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e); +struct l2t_data *t3_init_l2t(unsigned int l2t_capacity); +void t3_free_l2t(struct l2t_data *d); + +#ifdef CONFIG_PROC_FS +int t3_l2t_proc_setup(struct proc_dir_entry *dir, struct l2t_data *d); +void t3_l2t_proc_free(struct proc_dir_entry *dir); +#else +#define l2t_proc_setup(dir, d) 0 +#define l2t_proc_free(dir) +#endif + +int cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m); + +static inline int l2t_send(struct t3cdev *dev, struct mbuf *m, + struct l2t_entry *e) +{ + if (__predict_true(e->state == L2T_STATE_VALID)) { + return cxgb_ofld_send(dev, (struct mbuf *)m); + } + return t3_l2t_send_slow(dev, (struct mbuf *)m, e); +} + +static inline void l2t_release(struct l2t_data *d, struct l2t_entry *e) +{ + if (atomic_fetchadd_int(&e->refcnt, -1) == 1) + t3_l2e_free(d, e); +} + +static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) +{ + if (atomic_fetchadd_int(&e->refcnt, 1) == 1) /* 0 -> 1 transition */ + atomic_add_int(&d->nfree, 1); +} + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c new file mode 100644 index 0000000000000..1d15cf292dcd3 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -0,0 +1,338 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
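The l2t_release()/l2t_hold() inlines above rely on atomic_fetchadd_int() returning the counter's previous value: a release that sees 1 is the 1 -> 0 transition that must free the entry. The C11 sketch below uses stdatomic in place of FreeBSD's atomic(9) interface, an assumption made only for portability, to demonstrate that convention.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct entry {
	atomic_int refcnt;
	int freed;
};

static void
entry_free(struct entry *e)
{
	e->freed = 1;	/* stands in for t3_l2e_free() */
}

static void
entry_release(struct entry *e)
{
	/* fetch_add returns the old value: old == 1 means we dropped it to 0. */
	if (atomic_fetch_add(&e->refcnt, -1) == 1)
		entry_free(e);
}

int
main(void)
{
	struct entry e;

	atomic_init(&e.refcnt, 2);
	e.freed = 0;

	entry_release(&e);		/* 2 -> 1, still referenced */
	assert(!e.freed);
	entry_release(&e);		/* 1 -> 0, last reference */
	assert(e.freed);
	printf("refcount convention ok\n");
	return (0);
}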
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> + +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> + + +static struct listen_info *listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid); +static int listen_hash_del(struct tom_data *d, struct socket *so); + +/* + * Process a CPL_CLOSE_LISTSRV_RPL message. If the status is good we release + * the STID. + */ +static int +do_close_server_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_close_listserv_rpl *rpl = cplhdr(m); + unsigned int stid = GET_TID(rpl); + + if (rpl->status != CPL_ERR_NONE) + log(LOG_ERR, "Unexpected CLOSE_LISTSRV_RPL status %u for " + "STID %u\n", rpl->status, stid); + else { + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + + cxgb_free_stid(cdev, stid); + free(listen_ctx, M_CXGB); + } + + return (CPL_RET_BUF_DONE); +} + +/* + * Process a CPL_PASS_OPEN_RPL message. Remove the socket from the listen hash + * table and free the STID if there was any error, otherwise nothing to do. + */ +static int +do_pass_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_pass_open_rpl *rpl = cplhdr(m); + + if (rpl->status != CPL_ERR_NONE) { + int stid = GET_TID(rpl); + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + struct tom_data *d = listen_ctx->tom_data; + struct socket *lso = listen_ctx->lso; + +#if VALIDATE_TID + if (!lso) + return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE); +#endif + /* + * Note: It is safe to unconditionally call listen_hash_del() + * at this point without risking unhashing a reincarnation of + * an already closed socket (i.e., there is no listen, close, + * listen, free the sock for the second listen while processing + * a message for the first race) because we are still holding + * a reference on the socket. It is possible that the unhash + * will fail because the socket is already closed, but we can't + * unhash the wrong socket because it is impossible for the + * socket to which this message refers to have reincarnated. 
+ */ + listen_hash_del(d, lso); + cxgb_free_stid(cdev, stid); +#ifdef notyet + /* + * XXX need to unreference the inpcb + * but we have no way of knowing that other TOMs aren't referencing it + */ + sock_put(lso); +#endif + free(listen_ctx, M_CXGB); + } + return CPL_RET_BUF_DONE; +} + +void +t3_init_listen_cpl_handlers(void) +{ + t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); + t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); +} + +static inline int +listen_hashfn(const struct socket *so) +{ + return ((unsigned long)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1); +} + +/* + * Create and add a listen_info entry to the listen hash table. This and the + * listen hash table functions below cannot be called from softirqs. + */ +static struct listen_info * +listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid) +{ + struct listen_info *p; + + p = malloc(sizeof(*p), M_CXGB, M_NOWAIT|M_ZERO); + if (p) { + int bucket = listen_hashfn(so); + + p->so = so; /* just a key, no need to take a reference */ + p->stid = stid; + mtx_lock(&d->listen_lock); + p->next = d->listen_hash_tab[bucket]; + d->listen_hash_tab[bucket] = p; + mtx_unlock(&d->listen_lock); + } + return p; +} + +/* + * Given a pointer to a listening socket return its server TID by consulting + * the socket->stid map. Returns -1 if the socket is not in the map. + */ +static int +listen_hash_find(struct tom_data *d, struct socket *so) +{ + int stid = -1, bucket = listen_hashfn(so); + struct listen_info *p; + + mtx_lock(&d->listen_lock); + for (p = d->listen_hash_tab[bucket]; p; p = p->next) + if (p->so == so) { + stid = p->stid; + break; + } + mtx_unlock(&d->listen_lock); + return stid; +} + +/* + * Delete the listen_info structure for a listening socket. Returns the server + * TID for the socket if it is present in the socket->stid map, or -1. + */ +static int +listen_hash_del(struct tom_data *d, struct socket *so) +{ + int bucket, stid = -1; + struct listen_info *p, **prev; + + bucket = listen_hashfn(so); + prev = &d->listen_hash_tab[bucket]; + + mtx_lock(&d->listen_lock); + for (p = *prev; p; prev = &p->next, p = p->next) + if (p->so == so) { + stid = p->stid; + *prev = p->next; + free(p, M_CXGB); + break; + } + mtx_unlock(&d->listen_lock); + + return (stid); +} + +/* + * Start a listening server by sending a passive open request to HW. + */ +void +t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev) +{ + int stid; + struct mbuf *m; + struct cpl_pass_open_req *req; + struct tom_data *d = TOM_DATA(dev); + struct inpcb *inp = sotoinpcb(so); + struct listen_ctx *ctx; + + if (!TOM_TUNABLE(dev, activated)) + return; + + if (listen_hash_find(d, so) != -1) + return; + + CTR1(KTR_TOM, "start listen on port %u", ntohs(inp->inp_lport)); + ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT|M_ZERO); + + if (!ctx) + return; + + ctx->tom_data = d; + ctx->lso = so; + ctx->ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) ? 
ULP_MODE_TCPDDP : 0; + LIST_INIT(&ctx->synq_head); + + stid = cxgb_alloc_stid(d->cdev, d->client, ctx); + if (stid < 0) + goto free_ctx; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) + goto free_stid; + m->m_pkthdr.len = m->m_len = sizeof(*req); + + if (!listen_hash_add(d, so, stid)) + goto free_all; + + req = mtod(m, struct cpl_pass_open_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid)); + req->local_port = inp->inp_lport; + memcpy(&req->local_ip, &inp->inp_laddr, 4); + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(16)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + m_set_priority(m, CPL_PRIORITY_LISTEN); + cxgb_ofld_send(cdev, m); + return; + +free_all: + m_free(m); +free_stid: + cxgb_free_stid(cdev, stid); +#if 0 + sock_put(sk); +#endif +free_ctx: + free(ctx, M_CXGB); +} + +/* + * Stop a listening server by sending a close_listsvr request to HW. + * The server TID is freed when we get the reply. + */ +void +t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev) +{ + struct mbuf *m; + struct cpl_close_listserv_req *req; + struct listen_ctx *lctx; + int stid = listen_hash_del(TOM_DATA(dev), so); + + if (stid < 0) + return; + + lctx = cxgb_get_lctx(cdev, stid); + /* + * Do this early so embryonic connections are marked as being aborted + * while the stid is still open. This ensures pass_establish messages + * that arrive while we are closing the server will be able to locate + * the listening socket. + */ + t3_reset_synq(lctx); + + /* Send the close ASAP to stop further passive opens */ + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + /* + * XXX allocate from lowmem cache + */ + } + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req = mtod(m, struct cpl_close_listserv_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); + req->cpu_idx = 0; + m_set_priority(m, CPL_PRIORITY_LISTEN); + cxgb_ofld_send(cdev, m); + + t3_disconnect_acceptq(so); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h new file mode 100644 index 0000000000000..2cbfa7b38b28f --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h @@ -0,0 +1,181 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
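listen_hashfn() above keys the listen hash purely on the listening socket's pointer, discarding the low 10 bits (which carry little entropy because sockets come from the same allocator) and masking with LISTEN_INFO_HASH_SIZE - 1. The sketch below assumes a hypothetical 32-bucket table; the real LISTEN_INFO_HASH_SIZE lives in cxgb_tom.h and is not shown in this diff.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LISTEN_INFO_HASH_SIZE	32	/* assumed power-of-two bucket count */

static int
listen_hashfn(const void *so)
{
	return ((uintptr_t)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1);
}

int
main(void)
{
	/* The bucket is stable for a given pointer, which is all that the
	 * add/find/del routines above need from the hash. */
	char pool[2048];
	int b0 = listen_hashfn(&pool[0]);
	int b1 = listen_hashfn(&pool[1024]);

	assert(b0 >= 0 && b0 < LISTEN_INFO_HASH_SIZE);
	assert(listen_hashfn(&pool[0]) == b0);
	printf("buckets: %d %d\n", b0, b1);
	return (0);
}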
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +$FreeBSD$ + +***************************************************************************/ + +#ifndef T3_DDP_H +#define T3_DDP_H + +/* Should be 1 or 2 indicating single or double kernel buffers. */ +#define NUM_DDP_KBUF 2 + +/* min receive window for a connection to be considered for DDP */ +#define MIN_DDP_RCV_WIN (48 << 10) + +/* amount of Rx window not available to DDP to avoid window exhaustion */ +#define DDP_RSVD_WIN (16 << 10) + +/* # of sentinel invalid page pods at the end of a group of valid page pods */ +#define NUM_SENTINEL_PPODS 0 + +/* # of pages a pagepod can hold without needing another pagepod */ +#define PPOD_PAGES 4 + +/* page pods are allocated in groups of this size (must be power of 2) */ +#define PPOD_CLUSTER_SIZE 16 + +/* for each TID we reserve this many page pods up front */ +#define RSVD_PPODS_PER_TID 1 + +struct pagepod { + uint32_t pp_vld_tid; + uint32_t pp_pgsz_tag_color; + uint32_t pp_max_offset; + uint32_t pp_page_offset; + uint64_t pp_rsvd; + uint64_t pp_addr[5]; +}; + +#define PPOD_SIZE sizeof(struct pagepod) + +#define S_PPOD_TID 0 +#define M_PPOD_TID 0xFFFFFF +#define V_PPOD_TID(x) ((x) << S_PPOD_TID) + +#define S_PPOD_VALID 24 +#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID) +#define F_PPOD_VALID V_PPOD_VALID(1U) + +#define S_PPOD_COLOR 0 +#define M_PPOD_COLOR 0x3F +#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR) + +#define S_PPOD_TAG 6 +#define M_PPOD_TAG 0xFFFFFF +#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG) + +#define S_PPOD_PGSZ 30 +#define M_PPOD_PGSZ 0x3 +#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ) + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <machine/bus.h> + +/* DDP gather lists can specify an offset only for the first page. 
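+ * The list describes dgl_length bytes spread across dgl_nelem physical pages:
+ * dgl_offset is the byte offset into dgl_pages[0], and every subsequent page
+ * is used from offset 0.  For illustration only (assuming 4KB pages), a
+ * 10000-byte list starting 512 bytes into its first page would use
+ * dgl_pages[0] bytes 512..4095, dgl_pages[1] bytes 0..4095, and
+ * dgl_pages[2] bytes 0..2319.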
*/ +struct ddp_gather_list { + unsigned int dgl_length; + unsigned int dgl_offset; + unsigned int dgl_nelem; + vm_page_t dgl_pages[0]; +}; + +struct ddp_buf_state { + unsigned int cur_offset; /* offset of latest DDP notification */ + unsigned int flags; + struct ddp_gather_list *gl; +}; + +struct ddp_state { + struct ddp_buf_state buf_state[2]; /* per buffer state */ + int cur_buf; + unsigned short kbuf_noinval; + unsigned short kbuf_idx; /* which HW buffer is used for kbuf */ + struct ddp_gather_list *ubuf; + int user_ddp_pending; + unsigned int ubuf_nppods; /* # of page pods for buffer 1 */ + unsigned int ubuf_tag; + unsigned int ubuf_ddp_ready; + int cancel_ubuf; + int get_tcb_count; + unsigned int kbuf_posted; + unsigned int kbuf_nppods[NUM_DDP_KBUF]; + unsigned int kbuf_tag[NUM_DDP_KBUF]; + struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */ +}; + +/* buf_state flags */ +enum { + DDP_BF_NOINVAL = 1 << 0, /* buffer is set to NO_INVALIDATE */ + DDP_BF_NOCOPY = 1 << 1, /* DDP to final dest, no copy needed */ + DDP_BF_NOFLIP = 1 << 2, /* buffer flips after GET_TCB_RPL */ + DDP_BF_PSH = 1 << 3, /* set in skb->flags if the a DDP was + completed with a segment having the + PSH flag set */ + DDP_BF_NODATA = 1 << 4, /* buffer completed before filling */ +}; + +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +struct sockbuf; + +/* + * Returns 1 if a UBUF DMA buffer might be active. + */ +static inline int +t3_ddp_ubuf_pending(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + + /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP, + * but DDP_STATE() is only valid if the connection actually enabled + * DDP. + */ + if (p->kbuf[0] == NULL) + return (0); + + return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) || + (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)); +} + +int t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color); +int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag); +void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); +void t3_free_ddp_gl(struct ddp_gather_list *gl); +int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len); +//void t3_repost_kbuf(struct socket *so, int modulate, int activate); +void t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock); +int t3_post_ubuf(struct toepcb *toep, const struct uio *uio, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +void t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv); +int t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv, + const struct uio *uio, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +int t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock); +void t3_cleanup_ddp(struct toepcb *toep); +void t3_release_ddp_resources(struct toepcb *toep); +void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx); +void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0, + unsigned int tag1, unsigned int len); +void t3_setup_ddpbufs(struct toepcb *, unsigned int len0, unsigned int offset0, + unsigned int len1, unsigned int offset1, + uint64_t ddp_flags, uint64_t flag_mask, int modulate); +#endif /* T3_DDP_H */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h new file mode 100644 index 0000000000000..3042ef00b0f1b --- /dev/null +++ 
b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h @@ -0,0 +1,47 @@ + +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef CXGB_TCP_H_ +#define CXGB_TCP_H_ +#ifdef TCP_USRREQS_OVERLOAD +struct tcpcb *cxgb_tcp_drop(struct tcpcb *tp, int errno); +#else +#define cxgb_tcp_drop tcp_drop +#endif +void cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip); +struct tcpcb *cxgb_tcp_close(struct tcpcb *tp); + +extern struct pr_usrreqs cxgb_tcp_usrreqs; +#ifdef INET6 +extern struct pr_usrreqs cxgb_tcp6_usrreqs; +#endif + +#include <sys/sysctl.h> +SYSCTL_DECL(_net_inet_tcp_cxgb); +#endif /* CXGB_TCP_H_ */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c new file mode 100644 index 0000000000000..b61e1aca2c9ea --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c @@ -0,0 +1,95 @@ +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * grab bag of accessor routines that will either be moved to netinet + * or removed + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_var.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_offload.h> +#include <netinet/tcp_syncache.h> +#include <netinet/toedev.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> + + +/* + * This file contains code as a short-term staging area before it is moved in + * to sys/netinet/tcp_offload.c + */ + +void +sockbuf_lock(struct sockbuf *sb) +{ + + SOCKBUF_LOCK(sb); +} + +void +sockbuf_lock_assert(struct sockbuf *sb) +{ + + SOCKBUF_LOCK_ASSERT(sb); +} + +void +sockbuf_unlock(struct sockbuf *sb) +{ + + SOCKBUF_UNLOCK(sb); +} + +int +sockbuf_sbspace(struct sockbuf *sb) +{ + + return (sbspace(sb)); +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h new file mode 100644 index 0000000000000..bf0568c5e7c94 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h @@ -0,0 +1,155 @@ +/* $FreeBSD$ */ + +#ifndef CXGB_TCP_OFFLOAD_H_ +#define CXGB_TCP_OFFLOAD_H_ + +struct socket; +struct sockbuf; + +void sockbuf_lock(struct sockbuf *); +void sockbuf_lock_assert(struct sockbuf *); +void sockbuf_unlock(struct sockbuf *); +int sockbuf_sbspace(struct sockbuf *); + + +#ifndef _SYS_SOCKETVAR_H_ +#include <sys/selinfo.h> +#include <sys/sx.h> + +/* + * Constants for sb_flags field of struct sockbuf. + */ +#define SB_MAX (256*1024) /* default for max chars in sockbuf */ +/* + * Constants for sb_flags field of struct sockbuf. 
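+ * These mirror the definitions normally provided by <sys/socketvar.h>; they
+ * are only compiled in here when that header has not already been included
+ * (see the #ifndef _SYS_SOCKETVAR_H_ guard above).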
+ */ +#define SB_WAIT 0x04 /* someone is waiting for data/space */ +#define SB_SEL 0x08 /* someone is selecting */ +#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ +#define SB_UPCALL 0x20 /* someone wants an upcall */ +#define SB_NOINTR 0x40 /* operations not interruptible */ +#define SB_AIO 0x80 /* AIO operations queued */ +#define SB_KNOTE 0x100 /* kernel note attached */ +#define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */ +#define SB_IN_TOE 0x400 /* socket buffer is in the middle of an operation */ +#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ + + +struct sockbuf { + struct selinfo sb_sel; /* process selecting read/write */ + struct mtx sb_mtx; /* sockbuf lock */ + struct sx sb_sx; /* prevent I/O interlacing */ + short sb_state; /* (c/d) socket state on sockbuf */ +#define sb_startzero sb_mb + struct mbuf *sb_mb; /* (c/d) the mbuf chain */ + struct mbuf *sb_mbtail; /* (c/d) the last mbuf in the chain */ + struct mbuf *sb_lastrecord; /* (c/d) first mbuf of last + * record in socket buffer */ + struct mbuf *sb_sndptr; /* (c/d) pointer into mbuf chain */ + u_int sb_sndptroff; /* (c/d) byte offset of ptr into chain */ + u_int sb_cc; /* (c/d) actual chars in buffer */ + u_int sb_hiwat; /* (c/d) max actual char count */ + u_int sb_mbcnt; /* (c/d) chars of mbufs used */ + u_int sb_mbmax; /* (c/d) max chars of mbufs to use */ + u_int sb_ctl; /* (c/d) non-data chars in buffer */ + int sb_lowat; /* (c/d) low water mark */ + int sb_timeo; /* (c/d) timeout for read/write */ + short sb_flags; /* (c/d) flags, see below */ +}; + +void sbappend(struct sockbuf *sb, struct mbuf *m); +void sbappend_locked(struct sockbuf *sb, struct mbuf *m); +void sbappendstream(struct sockbuf *sb, struct mbuf *m); +void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m); +void sbdrop(struct sockbuf *sb, int len); +void sbdrop_locked(struct sockbuf *sb, int len); +void sbdroprecord(struct sockbuf *sb); +void sbdroprecord_locked(struct sockbuf *sb); +void sbflush(struct sockbuf *sb); +void sbflush_locked(struct sockbuf *sb); +int sbwait(struct sockbuf *sb); +int sblock(struct sockbuf *, int); +void sbunlock(struct sockbuf *); + + + +/* adjust counters in sb reflecting allocation of m */ +#define sballoc(sb, m) { \ + (sb)->sb_cc += (m)->m_len; \ + if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \ + (sb)->sb_ctl += (m)->m_len; \ + (sb)->sb_mbcnt += MSIZE; \ + if ((m)->m_flags & M_EXT) \ + (sb)->sb_mbcnt += (m)->m_ext.ext_size; \ +} + +/* adjust counters in sb reflecting freeing of m */ +#define sbfree(sb, m) { \ + (sb)->sb_cc -= (m)->m_len; \ + if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \ + (sb)->sb_ctl -= (m)->m_len; \ + (sb)->sb_mbcnt -= MSIZE; \ + if ((m)->m_flags & M_EXT) \ + (sb)->sb_mbcnt -= (m)->m_ext.ext_size; \ + if ((sb)->sb_sndptr == (m)) { \ + (sb)->sb_sndptr = NULL; \ + (sb)->sb_sndptroff = 0; \ + } \ + if ((sb)->sb_sndptroff != 0) \ + (sb)->sb_sndptroff -= (m)->m_len; \ +} + +#define SS_NOFDREF 0x0001 /* no file table ref any more */ +#define SS_ISCONNECTED 0x0002 /* socket connected to a peer */ +#define SS_ISCONNECTING 0x0004 /* in process of connecting to peer */ +#define SS_ISDISCONNECTING 0x0008 /* in process of disconnecting */ +#define SS_NBIO 0x0100 /* non-blocking ops */ +#define SS_ASYNC 0x0200 /* async i/o notify */ +#define SS_ISCONFIRMING 0x0400 /* deciding to accept connection req */ +#define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ +/* + * Protocols can mark a socket as SS_PROTOREF to indicate 
that, following + * pru_detach, they still want the socket to persist, and will free it + * themselves when they are done. Protocols should only ever call sofree() + * following setting this flag in pru_detach(), and never otherwise, as + * sofree() bypasses socket reference counting. + */ +#define SS_PROTOREF 0x4000 /* strong protocol reference */ + +/* + * Socket state bits now stored in the socket buffer state field. + */ +#define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ +#define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ +#define SBS_RCVATMARK 0x0040 /* at mark on input */ + + + +enum sopt_dir { SOPT_GET, SOPT_SET }; +struct sockopt { + enum sopt_dir sopt_dir; /* is this a get or a set? */ + int sopt_level; /* second arg of [gs]etsockopt */ + int sopt_name; /* third arg of [gs]etsockopt */ + void *sopt_val; /* fourth arg of [gs]etsockopt */ + size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ + struct thread *sopt_td; /* calling thread or null if kernel */ +}; + + +int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); +int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); + + +void soisconnected(struct socket *so); +void soisconnecting(struct socket *so); +void soisdisconnected(struct socket *so); +void soisdisconnecting(struct socket *so); +void socantrcvmore(struct socket *so); +void socantrcvmore_locked(struct socket *so); +void socantsendmore(struct socket *so); +void socantsendmore_locked(struct socket *so); + +#endif /* !NET_CORE */ + + +#endif /* CXGB_TCP_OFFLOAD_H_ */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h new file mode 100644 index 0000000000000..7c4bd0c06c414 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 2007-2008, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ +#ifndef CXGB_TOEPCB_H_ +#define CXGB_TOEPCB_H_ +#include <sys/bus.h> +#include <sys/condvar.h> +#include <dev/cxgb/sys/mbufq.h> + +struct toepcb { + struct toedev *tp_toedev; + struct l2t_entry *tp_l2t; + unsigned int tp_tid; + int tp_wr_max; + int tp_wr_avail; + int tp_wr_unacked; + int tp_delack_mode; + int tp_mtu_idx; + int tp_ulp_mode; + int tp_qset_idx; + int tp_mss_clamp; + int tp_qset; + int tp_flags; + int tp_enqueued_bytes; + int tp_page_count; + int tp_state; + + tcp_seq tp_iss; + tcp_seq tp_delack_seq; + tcp_seq tp_rcv_wup; + tcp_seq tp_copied_seq; + uint64_t tp_write_seq; + + volatile int tp_refcount; + vm_page_t *tp_pages; + + struct tcpcb *tp_tp; + struct mbuf *tp_m_last; + bus_dma_tag_t tp_tx_dmat; + bus_dma_tag_t tp_rx_dmat; + bus_dmamap_t tp_dmamap; + + LIST_ENTRY(toepcb) synq_entry; + struct mbuf_head wr_list; + struct mbuf_head out_of_order_queue; + struct ddp_state tp_ddp_state; + struct cv tp_cv; + +}; + +static inline void +reset_wr_list(struct toepcb *toep) +{ + + mbufq_init(&toep->wr_list); +} + +static inline void +purge_wr_queue(struct toepcb *toep) +{ + struct mbuf *m; + + while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) + m_freem(m); +} + +static inline void +enqueue_wr(struct toepcb *toep, struct mbuf *m) +{ + + mbufq_tail(&toep->wr_list, m); +} + +static inline struct mbuf * +peek_wr(const struct toepcb *toep) +{ + + return (mbufq_peek(&toep->wr_list)); +} + +static inline struct mbuf * +dequeue_wr(struct toepcb *toep) +{ + + return (mbufq_dequeue(&toep->wr_list)); +} + +#define wr_queue_walk(toep, m) \ + for (m = peek_wr(toep); m; m = m->m_nextpkt) + + + +#endif + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c new file mode 100644 index 0000000000000..751b1cd0b051e --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c @@ -0,0 +1,1510 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/eventhandler.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/taskqueue.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/in_pcb.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_offload.h> +#include <netinet/tcp_fsm.h> + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + +#include <net/if_vlan_var.h> +#include <net/route.h> + + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> + + + + + +static int activated = 1; +TUNABLE_INT("hw.t3toe.activated", &activated); +SYSCTL_NODE(_hw, OID_AUTO, t3toe, CTLFLAG_RD, 0, "T3 toe driver parameters"); +SYSCTL_UINT(_hw_t3toe, OID_AUTO, activated, CTLFLAG_RDTUN, &activated, 0, + "enable TOE at init time"); + + +TAILQ_HEAD(, adapter) adapter_list; +static struct rwlock adapter_list_lock; + +static TAILQ_HEAD(, tom_data) cxgb_list; +static struct mtx cxgb_list_lock; +static const unsigned int MAX_ATIDS = 64 * 1024; +static const unsigned int ATID_BASE = 0x100000; + +static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry); +static void cxgb_register_listeners(void); +static void t3c_tom_add(struct t3cdev *cdev); + +/* + * Handlers for each CPL opcode + */ +static cxgb_cpl_handler_func tom_cpl_handlers[256]; + + +static eventhandler_tag listen_tag; + +static struct offload_id t3_toe_id_tab[] = { + { TOE_ID_CHELSIO_T3, 0 }, + { TOE_ID_CHELSIO_T3B, 0 }, + { TOE_ID_CHELSIO_T3C, 0 }, + { 0 } +}; + +static struct tom_info t3_tom_info = { + .ti_attach = t3_toe_attach, + .ti_id_table = t3_toe_id_tab, + .ti_name = "Chelsio-T3" +}; + +struct cxgb_client t3c_tom_client = { + .name = "tom_cxgb3", + .add = t3c_tom_add, + .remove = NULL, + .handlers = tom_cpl_handlers, + .redirect = NULL +}; + +/* + * Add an skb to the deferred skb queue for processing from process context. 
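+ * In this FreeBSD port the deferred "skb" is an mbuf: t3_defer_reply() tags
+ * the reply with a handler via m_set_handler(), appends it to tom_data->deferq
+ * and, when the queue becomes non-empty, schedules deferq_task on the TOM
+ * taskqueue so the handler runs later in a sleepable context.  A hypothetical
+ * caller (my_deferred_handler is illustrative only):
+ *
+ *	t3_defer_reply(m, dev, my_deferred_handler);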
+ */ +void +t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler) +{ + struct tom_data *td = TOM_DATA(dev); + + m_set_handler(m, handler); + mtx_lock(&td->deferq.lock); + + mbufq_tail(&td->deferq, m); + if (mbufq_len(&td->deferq) == 1) + taskqueue_enqueue(td->tq, &td->deferq_task); + mtx_lock(&td->deferq.lock); +} + +struct toepcb * +toepcb_alloc(void) +{ + struct toepcb *toep; + + toep = malloc(sizeof(struct toepcb), M_CXGB, M_NOWAIT|M_ZERO); + + if (toep == NULL) + return (NULL); + + toepcb_init(toep); + return (toep); +} + +void +toepcb_init(struct toepcb *toep) +{ + toep->tp_refcount = 1; + cv_init(&toep->tp_cv, "toep cv"); +} + +void +toepcb_hold(struct toepcb *toep) +{ + atomic_add_acq_int(&toep->tp_refcount, 1); +} + +void +toepcb_release(struct toepcb *toep) +{ + if (toep->tp_refcount == 1) { + free(toep, M_CXGB); + return; + } + atomic_add_acq_int(&toep->tp_refcount, -1); +} + + +/* + * Add a T3 offload device to the list of devices we are managing. + */ +static void +t3cdev_add(struct tom_data *t) +{ + mtx_lock(&cxgb_list_lock); + TAILQ_INSERT_TAIL(&cxgb_list, t, entry); + mtx_unlock(&cxgb_list_lock); +} + +static inline int +cdev2type(struct t3cdev *cdev) +{ + int type = 0; + + switch (cdev->type) { + case T3A: + type = TOE_ID_CHELSIO_T3; + break; + case T3B: + type = TOE_ID_CHELSIO_T3B; + break; + case T3C: + type = TOE_ID_CHELSIO_T3C; + break; + } + return (type); +} + +/* + * Allocate and initialize the TID tables. Returns 0 on success. + */ +static int +init_tid_tabs(struct tid_info *t, unsigned int ntids, + unsigned int natids, unsigned int nstids, + unsigned int atid_base, unsigned int stid_base) +{ + unsigned long size = ntids * sizeof(*t->tid_tab) + + natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab); + + t->tid_tab = cxgb_alloc_mem(size); + if (!t->tid_tab) + return (ENOMEM); + + t->stid_tab = (union listen_entry *)&t->tid_tab[ntids]; + t->atid_tab = (union active_open_entry *)&t->stid_tab[nstids]; + t->ntids = ntids; + t->nstids = nstids; + t->stid_base = stid_base; + t->sfree = NULL; + t->natids = natids; + t->atid_base = atid_base; + t->afree = NULL; + t->stids_in_use = t->atids_in_use = 0; + atomic_set_int(&t->tids_in_use, 0); + mtx_init(&t->stid_lock, "stid", NULL, MTX_DUPOK|MTX_DEF); + mtx_init(&t->atid_lock, "atid", NULL, MTX_DUPOK|MTX_DEF); + + /* + * Setup the free lists for stid_tab and atid_tab. + */ + if (nstids) { + while (--nstids) + t->stid_tab[nstids - 1].next = &t->stid_tab[nstids]; + t->sfree = t->stid_tab; + } + if (natids) { + while (--natids) + t->atid_tab[natids - 1].next = &t->atid_tab[natids]; + t->afree = t->atid_tab; + } + return 0; +} + +static void +free_tid_maps(struct tid_info *t) +{ + mtx_destroy(&t->stid_lock); + mtx_destroy(&t->atid_lock); + cxgb_free_mem(t->tid_tab); +} + +static inline void +add_adapter(adapter_t *adap) +{ + rw_wlock(&adapter_list_lock); + TAILQ_INSERT_TAIL(&adapter_list, adap, adapter_entry); + rw_wunlock(&adapter_list_lock); +} + +static inline void +remove_adapter(adapter_t *adap) +{ + rw_wlock(&adapter_list_lock); + TAILQ_REMOVE(&adapter_list, adap, adapter_entry); + rw_wunlock(&adapter_list_lock); +} + +/* + * Populate a TID_RELEASE WR. The mbuf must be already propely sized. 
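+ * "Properly sized" means the mbuf has room for a struct cpl_tid_release in
+ * its data area; the typical sequence, as used by the release task below, is:
+ *
+ *	m = m_get(M_WAIT, MT_DATA);
+ *	mk_tid_release(m, tid);
+ *	cxgb_ofld_send(tdev, m);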
+ */ +static inline void +mk_tid_release(struct mbuf *m, unsigned int tid) +{ + struct cpl_tid_release *req; + + m_set_priority(m, CPL_PRIORITY_SETUP); + req = mtod(m, struct cpl_tid_release *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +static void +t3_process_tid_release_list(void *data, int pending) +{ + struct mbuf *m; + struct t3cdev *tdev = data; + struct t3c_data *td = T3C_DATA (tdev); + + mtx_lock(&td->tid_release_lock); + while (td->tid_release_list) { + struct toe_tid_entry *p = td->tid_release_list; + + td->tid_release_list = (struct toe_tid_entry *)p->ctx; + mtx_unlock(&td->tid_release_lock); + m = m_get(M_WAIT, MT_DATA); + mk_tid_release(m, p - td->tid_maps.tid_tab); + cxgb_ofld_send(tdev, m); + p->ctx = NULL; + mtx_lock(&td->tid_release_lock); + } + mtx_unlock(&td->tid_release_lock); +} + +int +cxgb_offload_activate(struct adapter *adapter) +{ + struct t3cdev *dev = &adapter->tdev; + int natids, err; + struct t3c_data *t; + struct tid_range stid_range, tid_range; + struct mtutab mtutab; + unsigned int l2t_capacity; + + t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); + if (!t) + return (ENOMEM); + dev->adapter = adapter; + + err = (EOPNOTSUPP); + if (dev->ctl(dev, GET_TX_MAX_CHUNK, &t->tx_max_chunk) < 0 || + dev->ctl(dev, GET_MAX_OUTSTANDING_WR, &t->max_wrs) < 0 || + dev->ctl(dev, GET_L2T_CAPACITY, &l2t_capacity) < 0 || + dev->ctl(dev, GET_MTUS, &mtutab) < 0 || + dev->ctl(dev, GET_TID_RANGE, &tid_range) < 0 || + dev->ctl(dev, GET_STID_RANGE, &stid_range) < 0) { + device_printf(adapter->dev, "%s: dev->ctl check failed\n", __FUNCTION__); + goto out_free; + } + + err = (ENOMEM); + L2DATA(dev) = t3_init_l2t(l2t_capacity); + if (!L2DATA(dev)) { + device_printf(adapter->dev, "%s: t3_init_l2t failed\n", __FUNCTION__); + goto out_free; + } + natids = min(tid_range.num / 2, MAX_ATIDS); + err = init_tid_tabs(&t->tid_maps, tid_range.num, natids, + stid_range.num, ATID_BASE, stid_range.base); + if (err) { + device_printf(adapter->dev, "%s: init_tid_tabs failed\n", __FUNCTION__); + goto out_free_l2t; + } + + t->mtus = mtutab.mtus; + t->nmtus = mtutab.size; + + TASK_INIT(&t->tid_release_task, 0 /* XXX? 
*/, t3_process_tid_release_list, dev); + mtx_init(&t->tid_release_lock, "tid release", NULL, MTX_DUPOK|MTX_DEF); + t->dev = dev; + + T3C_DATA (dev) = t; + dev->recv = process_rx; + dev->arp_update = t3_l2t_update; + /* Register netevent handler once */ + if (TAILQ_EMPTY(&adapter_list)) { +#if defined(CONFIG_CHELSIO_T3_MODULE) + if (prepare_arp_with_t3core()) + log(LOG_ERR, "Unable to set offload capabilities\n"); +#endif + } + CTR1(KTR_CXGB, "adding adapter %p", adapter); + add_adapter(adapter); + device_printf(adapter->dev, "offload started\n"); + adapter->flags |= CXGB_OFLD_INIT; + return (0); + +out_free_l2t: + t3_free_l2t(L2DATA(dev)); + L2DATA(dev) = NULL; +out_free: + free(t, M_CXGB); + return (err); +} + +void +cxgb_offload_deactivate(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + struct t3c_data *t = T3C_DATA(tdev); + + printf("removing adapter %p\n", adapter); + remove_adapter(adapter); + if (TAILQ_EMPTY(&adapter_list)) { +#if defined(CONFIG_CHELSIO_T3_MODULE) + restore_arp_sans_t3core(); +#endif + } + free_tid_maps(&t->tid_maps); + T3C_DATA(tdev) = NULL; + t3_free_l2t(L2DATA(tdev)); + L2DATA(tdev) = NULL; + mtx_destroy(&t->tid_release_lock); + free(t, M_CXGB); +} + +/* + * Sends an sk_buff to a T3C driver after dealing with any active network taps. + */ +int +cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m) +{ + int r; + + r = dev->send(dev, m); + return r; +} + +static struct ifnet * +get_iff_from_mac(adapter_t *adapter, const uint8_t *mac, unsigned int vlan) +{ + int i; + + for_each_port(adapter, i) { +#ifdef notyet + const struct vlan_group *grp; +#endif + const struct port_info *p = &adapter->port[i]; + struct ifnet *ifp = p->ifp; + + if (!memcmp(p->hw_addr, mac, ETHER_ADDR_LEN)) { +#ifdef notyet + + if (vlan && vlan != EVL_VLID_MASK) { + grp = p->vlan_grp; + dev = grp ? grp->vlan_devices[vlan] : NULL; + } else + while (dev->master) + dev = dev->master; +#endif + return (ifp); + } + } + return (NULL); +} + +static inline void +failover_fixup(adapter_t *adapter, int port) +{ + if (adapter->params.rev == 0) { + struct ifnet *ifp = adapter->port[port].ifp; + struct cmac *mac = &adapter->port[port].mac; + if (!(ifp->if_flags & IFF_UP)) { + /* Failover triggered by the interface ifdown */ + t3_write_reg(adapter, A_XGM_TX_CTRL + mac->offset, + F_TXEN); + t3_read_reg(adapter, A_XGM_TX_CTRL + mac->offset); + } else { + /* Failover triggered by the interface link down */ + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0); + t3_read_reg(adapter, A_XGM_RX_CTRL + mac->offset); + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, + F_RXEN); + } + } +} + +static int +cxgb_ulp_iscsi_ctl(adapter_t *adapter, unsigned int req, void *data) +{ + int ret = 0; + struct ulp_iscsi_info *uiip = data; + + switch (req) { + case ULP_ISCSI_GET_PARAMS: + uiip->llimit = t3_read_reg(adapter, A_ULPRX_ISCSI_LLIMIT); + uiip->ulimit = t3_read_reg(adapter, A_ULPRX_ISCSI_ULIMIT); + uiip->tagmask = t3_read_reg(adapter, A_ULPRX_ISCSI_TAGMASK); + /* + * On tx, the iscsi pdu has to be <= tx page size and has to + * fit into the Tx PM FIFO. 
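+ * The min() below therefore clamps max_txsz to the smaller of the TP TX page
+ * size and the Tx PM FIFO capacity derived from the A_PM1_TX_CFG register.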
+ */ + uiip->max_txsz = min(adapter->params.tp.tx_pg_size, + t3_read_reg(adapter, A_PM1_TX_CFG) >> 17); + /* on rx, the iscsi pdu has to be < rx page size and the + whole pdu + cpl headers has to fit into one sge buffer */ + /* also check the max rx data length programmed in TP */ + uiip->max_rxsz = min(uiip->max_rxsz, + ((t3_read_reg(adapter, A_TP_PARA_REG2)) + >> S_MAXRXDATA) & M_MAXRXDATA); + break; + case ULP_ISCSI_SET_PARAMS: + t3_write_reg(adapter, A_ULPRX_ISCSI_TAGMASK, uiip->tagmask); + break; + default: + ret = (EOPNOTSUPP); + } + return ret; +} + +/* Response queue used for RDMA events. */ +#define ASYNC_NOTIF_RSPQ 0 + +static int +cxgb_rdma_ctl(adapter_t *adapter, unsigned int req, void *data) +{ + int ret = 0; + + switch (req) { + case RDMA_GET_PARAMS: { + struct rdma_info *req = data; + + req->udbell_physbase = rman_get_start(adapter->udbs_res); + req->udbell_len = rman_get_size(adapter->udbs_res); + req->tpt_base = t3_read_reg(adapter, A_ULPTX_TPT_LLIMIT); + req->tpt_top = t3_read_reg(adapter, A_ULPTX_TPT_ULIMIT); + req->pbl_base = t3_read_reg(adapter, A_ULPTX_PBL_LLIMIT); + req->pbl_top = t3_read_reg(adapter, A_ULPTX_PBL_ULIMIT); + req->rqt_base = t3_read_reg(adapter, A_ULPRX_RQ_LLIMIT); + req->rqt_top = t3_read_reg(adapter, A_ULPRX_RQ_ULIMIT); + req->kdb_addr = (void *)((unsigned long)rman_get_virtual(adapter->regs_res) + A_SG_KDOORBELL); break; + } + case RDMA_CQ_OP: { + struct rdma_cq_op *req = data; + + /* may be called in any context */ + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_cqcntxt_op(adapter, req->id, req->op, + req->credits); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + } + case RDMA_GET_MEM: { + struct ch_mem_range *t = data; + struct mc7 *mem; + + if ((t->addr & 7) || (t->len & 7)) + return (EINVAL); + if (t->mem_id == MEM_CM) + mem = &adapter->cm; + else if (t->mem_id == MEM_PMRX) + mem = &adapter->pmrx; + else if (t->mem_id == MEM_PMTX) + mem = &adapter->pmtx; + else + return (EINVAL); + + ret = t3_mc7_bd_read(mem, t->addr/8, t->len/8, (u64 *)t->buf); + if (ret) + return (ret); + break; + } + case RDMA_CQ_SETUP: { + struct rdma_cq_setup *req = data; + + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_init_cqcntxt(adapter, req->id, req->base_addr, + req->size, ASYNC_NOTIF_RSPQ, + req->ovfl_mode, req->credits, + req->credit_thres); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + } + case RDMA_CQ_DISABLE: + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_disable_cqcntxt(adapter, *(unsigned int *)data); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + case RDMA_CTRL_QP_SETUP: { + struct rdma_ctrlqp_setup *req = data; + + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_init_ecntxt(adapter, FW_RI_SGEEC_START, 0, + SGE_CNTXT_RDMA, ASYNC_NOTIF_RSPQ, + req->base_addr, req->size, + FW_RI_TID_START, 1, 0); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + } + default: + ret = EOPNOTSUPP; + } + return (ret); +} + +static int +cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data) +{ + struct adapter *adapter = tdev2adap(tdev); + struct tid_range *tid; + struct mtutab *mtup; + struct iff_mac *iffmacp; + struct ddp_params *ddpp; + struct adap_ports *ports; + struct ofld_page_info *rx_page_info; + struct tp_params *tp = &adapter->params.tp; + int port; + + switch (req) { + case GET_MAX_OUTSTANDING_WR: + *(unsigned int *)data = FW_WR_NUM; + break; + case GET_WR_LEN: + *(unsigned int *)data = WR_FLITS; + break; + case GET_TX_MAX_CHUNK: + *(unsigned int *)data = 1 << 20; /* 1MB */ + break; + case 
GET_TID_RANGE: + tid = data; + tid->num = t3_mc5_size(&adapter->mc5) - + adapter->params.mc5.nroutes - + adapter->params.mc5.nfilters - + adapter->params.mc5.nservers; + tid->base = 0; + break; + case GET_STID_RANGE: + tid = data; + tid->num = adapter->params.mc5.nservers; + tid->base = t3_mc5_size(&adapter->mc5) - tid->num - + adapter->params.mc5.nfilters - + adapter->params.mc5.nroutes; + break; + case GET_L2T_CAPACITY: + *(unsigned int *)data = 2048; + break; + case GET_MTUS: + mtup = data; + mtup->size = NMTUS; + mtup->mtus = adapter->params.mtus; + break; + case GET_IFF_FROM_MAC: + iffmacp = data; + iffmacp->dev = get_iff_from_mac(adapter, iffmacp->mac_addr, + iffmacp->vlan_tag & EVL_VLID_MASK); + break; + case GET_DDP_PARAMS: + ddpp = data; + ddpp->llimit = t3_read_reg(adapter, A_ULPRX_TDDP_LLIMIT); + ddpp->ulimit = t3_read_reg(adapter, A_ULPRX_TDDP_ULIMIT); + ddpp->tag_mask = t3_read_reg(adapter, A_ULPRX_TDDP_TAGMASK); + break; + case GET_PORTS: + ports = data; + ports->nports = adapter->params.nports; + for_each_port(adapter, port) + ports->lldevs[port] = adapter->port[port].ifp; + break; + case FAILOVER: + port = *(int *)data; + t3_port_failover(adapter, port); + failover_fixup(adapter, port); + break; + case FAILOVER_DONE: + port = *(int *)data; + t3_failover_done(adapter, port); + break; + case FAILOVER_CLEAR: + t3_failover_clear(adapter); + break; + case GET_RX_PAGE_INFO: + rx_page_info = data; + rx_page_info->page_size = tp->rx_pg_size; + rx_page_info->num = tp->rx_num_pgs; + break; + case ULP_ISCSI_GET_PARAMS: + case ULP_ISCSI_SET_PARAMS: + if (!offload_running(adapter)) + return (EAGAIN); + return cxgb_ulp_iscsi_ctl(adapter, req, data); + case RDMA_GET_PARAMS: + case RDMA_CQ_OP: + case RDMA_CQ_SETUP: + case RDMA_CQ_DISABLE: + case RDMA_CTRL_QP_SETUP: + case RDMA_GET_MEM: + if (!offload_running(adapter)) + return (EAGAIN); + return cxgb_rdma_ctl(adapter, req, data); + default: + return (EOPNOTSUPP); + } + return 0; +} + +/* + * Allocate a TOM data structure, + * initialize its cpl_handlers + * and register it as a T3C client + */ +static void +t3c_tom_add(struct t3cdev *cdev) +{ + int i; + unsigned int wr_len; + struct tom_data *t; + struct toedev *tdev; + struct adap_ports *port_info; + + t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); + if (t == NULL) + return; + + cdev->send = t3_offload_tx; + cdev->ctl = cxgb_offload_ctl; + + if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0) + goto out_free_tom; + + port_info = malloc(sizeof(*port_info), M_CXGB, M_NOWAIT|M_ZERO); + if (!port_info) + goto out_free_tom; + + if (cdev->ctl(cdev, GET_PORTS, port_info) < 0) + goto out_free_all; + + t3_init_wr_tab(wr_len); + t->cdev = cdev; + t->client = &t3c_tom_client; + + /* Register TCP offload device */ + tdev = &t->tdev; + tdev->tod_ttid = cdev2type(cdev); + tdev->tod_lldev = cdev->lldev; + + if (register_toedev(tdev, "toe%d")) { + printf("unable to register offload device"); + goto out_free_all; + } + TOM_DATA(tdev) = t; + + for (i = 0; i < port_info->nports; i++) { + struct ifnet *ifp = port_info->lldevs[i]; + TOEDEV(ifp) = tdev; + + CTR1(KTR_TOM, "enabling toe on %p", ifp); + ifp->if_capabilities |= IFCAP_TOE4; + ifp->if_capenable |= IFCAP_TOE4; + } + t->ports = port_info; + + /* Add device to the list of offload devices */ + t3cdev_add(t); + + /* Activate TCP offload device */ + cxgb_offload_activate(TOM_DATA(tdev)->cdev->adapter); + + activate_offload(tdev); + cxgb_register_listeners(); + return; + +out_free_all: + printf("out_free_all fail\n"); + free(port_info, M_CXGB); +out_free_tom: 
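+	/*
+	 * Reached either directly (before port_info was allocated) or by
+	 * falling through from out_free_all above, which already freed
+	 * port_info; only the tom_data allocation itself remains.
+	 */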
+ printf("out_free_tom fail\n"); + free(t, M_CXGB); + return; +} + + + +static int +do_act_open_rpl(struct t3cdev *dev, struct mbuf *m) +{ + struct cpl_act_open_rpl *rpl = cplhdr(m); + unsigned int atid = G_TID(ntohl(rpl->atid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid); + if (toe_tid->ctx && toe_tid->client && toe_tid->client->handlers && + toe_tid->client->handlers[CPL_ACT_OPEN_RPL]) { + return toe_tid->client->handlers[CPL_ACT_OPEN_RPL] (dev, m, + toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, CPL_ACT_OPEN_RPL); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_stid_rpl(struct t3cdev *dev, struct mbuf *m) +{ + union opcode_tid *p = cplhdr(m); + unsigned int stid = G_TID(ntohl(p->opcode_tid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[p->opcode]) { + return toe_tid->client->handlers[p->opcode] (dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_hwtid_rpl(struct t3cdev *dev, struct mbuf *m) +{ + union opcode_tid *p = cplhdr(m); + unsigned int hwtid; + struct toe_tid_entry *toe_tid; + + DPRINTF("do_hwtid_rpl opcode=0x%x\n", p->opcode); + hwtid = G_TID(ntohl(p->opcode_tid)); + + toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[p->opcode]) { + return toe_tid->client->handlers[p->opcode] + (dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_cr(struct t3cdev *dev, struct mbuf *m) +{ + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ]) { + return toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ] + (dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, CPL_PASS_ACCEPT_REQ); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_abort_req_rss(struct t3cdev *dev, struct mbuf *m) +{ + union opcode_tid *p = cplhdr(m); + unsigned int hwtid = G_TID(ntohl(p->opcode_tid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[p->opcode]) { + return toe_tid->client->handlers[p->opcode] + (dev, m, toe_tid->ctx); + } else { + struct cpl_abort_req_rss *req = cplhdr(m); + struct cpl_abort_rpl *rpl; + + struct mbuf *m = m_get(M_NOWAIT, MT_DATA); + if (!m) { + log(LOG_NOTICE, "do_abort_req_rss: couldn't get mbuf!\n"); + goto out; + } + + m_set_priority(m, CPL_PRIORITY_DATA); + rpl = cplhdr(m); + rpl->wr.wr_hi = + htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(GET_TID(req))); + OPCODE_TID(rpl) = + htonl(MK_OPCODE_TID(CPL_ABORT_RPL, GET_TID(req))); + rpl->cmd = req->status; + cxgb_ofld_send(dev, m); + out: + return (CPL_RET_BUF_DONE); + } +} + +static int +do_act_establish(struct t3cdev *dev, struct mbuf *m) +{ + struct 
cpl_act_establish *req; + unsigned int atid; + struct toe_tid_entry *toe_tid; + + req = cplhdr(m); + atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid); + if (toe_tid && toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[CPL_ACT_ESTABLISH]) { + + return toe_tid->client->handlers[CPL_ACT_ESTABLISH] + (dev, m, toe_tid->ctx); + } else { + + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, CPL_PASS_ACCEPT_REQ); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + + +static int +do_term(struct t3cdev *dev, struct mbuf *m) +{ + unsigned int hwtid = ntohl(m_get_priority(m)) >> 8 & 0xfffff; + unsigned int opcode = G_OPCODE(ntohl(m->m_pkthdr.csum_data)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); + if (toe_tid && toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[opcode]) { + return toe_tid->client->handlers[opcode](dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } + return (0); +} + +/* + * Process a received packet with an unknown/unexpected CPL opcode. + */ +static int +do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name, + 0xFF & *mtod(m, unsigned int *)); + return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG); +} + +/* + * Add a new handler to the CPL dispatch table. A NULL handler may be supplied + * to unregister an existing handler. + */ +void +t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h) +{ + if (opcode < UCHAR_MAX) + tom_cpl_handlers[opcode] = h ? h : do_bad_cpl; + else + log(LOG_ERR, "Chelsio T3 TOM: handler registration for " + "opcode %u failed\n", opcode); +} + +/* + * Make a preliminary determination if a connection can be offloaded. It's OK + * to fail the offload later if we say we can offload here. For now this + * always accepts the offload request unless there are IP options. + */ +static int +can_offload(struct toedev *dev, struct socket *so) +{ + struct tom_data *tomd = TOM_DATA(dev); + struct t3cdev *cdev = T3CDEV(dev->tod_lldev); + struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; + + return so_sotoinpcb(so)->inp_depend4.inp4_options == NULL && + tomd->conf.activated && + (tomd->conf.max_conn < 0 || + atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn); +} + +static int +tom_ctl(struct toedev *dev, unsigned int req, void *data) +{ + struct tom_data *t = TOM_DATA(dev); + struct t3cdev *cdev = t->cdev; + + if (cdev->ctl) + return cdev->ctl(cdev, req, data); + + return (EOPNOTSUPP); +} + +/* + * Free an active-open TID. + */ +void * +cxgb_free_atid(struct t3cdev *tdev, int atid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + union active_open_entry *p = atid2entry(t, atid); + void *ctx = p->toe_tid.ctx; + + mtx_lock(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + mtx_unlock(&t->atid_lock); + + return ctx; +} + +/* + * Free a server TID and return it to the free pool. + */ +void +cxgb_free_stid(struct t3cdev *tdev, int stid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + union listen_entry *p = stid2entry(t, stid); + + mtx_lock(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + mtx_unlock(&t->stid_lock); +} + +/* + * Free a server TID and return it to the free pool. 
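+ * (Note: despite the summary above, this routine frees nothing; it simply
+ * returns the listen context pointer stored for the given server TID.)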
+ */ +void * +cxgb_get_lctx(struct t3cdev *tdev, int stid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + union listen_entry *p = stid2entry(t, stid); + + return (p->toe_tid.ctx); +} + +void +cxgb_insert_tid(struct t3cdev *tdev, struct cxgb_client *client, + void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + t->tid_tab[tid].client = client; + t->tid_tab[tid].ctx = ctx; + atomic_add_int(&t->tids_in_use, 1); +} + +/* use ctx as a next pointer in the tid release list */ +void +cxgb_queue_tid_release(struct t3cdev *tdev, unsigned int tid) +{ + struct t3c_data *td = T3C_DATA (tdev); + struct toe_tid_entry *p = &td->tid_maps.tid_tab[tid]; + + CTR0(KTR_TOM, "queuing tid release\n"); + + mtx_lock(&td->tid_release_lock); + p->ctx = td->tid_release_list; + td->tid_release_list = p; + + if (!p->ctx) + taskqueue_enqueue(tdev->adapter->tq, &td->tid_release_task); + + mtx_unlock(&td->tid_release_lock); +} + +/* + * Remove a tid from the TID table. A client may defer processing its last + * CPL message if it is locked at the time it arrives, and while the message + * sits in the client's backlog the TID may be reused for another connection. + * To handle this we atomically switch the TID association if it still points + * to the original client context. + */ +void +cxgb_remove_tid(struct t3cdev *tdev, void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + if (tid >= t->ntids) + panic("tid=%d >= t->ntids=%d", tid, t->ntids); + + if (tdev->type == T3A) + atomic_cmpset_ptr((uintptr_t *)&t->tid_tab[tid].ctx, (long)NULL, (long)ctx); + else { + struct mbuf *m; + + m = m_get(M_NOWAIT, MT_DATA); + if (__predict_true(m != NULL)) { + mk_tid_release(m, tid); + CTR1(KTR_CXGB, "releasing tid=%u", tid); + + cxgb_ofld_send(tdev, m); + t->tid_tab[tid].ctx = NULL; + } else + cxgb_queue_tid_release(tdev, tid); + } + atomic_add_int(&t->tids_in_use, -1); +} + +int +cxgb_alloc_atid(struct t3cdev *tdev, struct cxgb_client *client, + void *ctx) +{ + int atid = -1; + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + mtx_lock(&t->atid_lock); + if (t->afree) { + union active_open_entry *p = t->afree; + + atid = (p - t->atid_tab) + t->atid_base; + t->afree = p->next; + p->toe_tid.ctx = ctx; + p->toe_tid.client = client; + t->atids_in_use++; + } + mtx_unlock(&t->atid_lock); + return atid; +} + +int +cxgb_alloc_stid(struct t3cdev *tdev, struct cxgb_client *client, + void *ctx) +{ + int stid = -1; + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + mtx_lock(&t->stid_lock); + if (t->sfree) { + union listen_entry *p = t->sfree; + + stid = (p - t->stid_tab) + t->stid_base; + t->sfree = p->next; + p->toe_tid.ctx = ctx; + p->toe_tid.client = client; + t->stids_in_use++; + } + mtx_unlock(&t->stid_lock); + return stid; +} + + +static int +is_offloading(struct ifnet *ifp) +{ + struct adapter *adapter; + int port; + + rw_rlock(&adapter_list_lock); + TAILQ_FOREACH(adapter, &adapter_list, adapter_entry) { + for_each_port(adapter, port) { + if (ifp == adapter->port[port].ifp) { + rw_runlock(&adapter_list_lock); + return 1; + } + } + } + rw_runlock(&adapter_list_lock); + return 0; +} + + +static void +cxgb_arp_update_event(void *unused, struct rtentry *rt0, + uint8_t *enaddr, struct sockaddr *sa) +{ + + if (!is_offloading(rt0->rt_ifp)) + return; + + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + cxgb_neigh_update(rt0, enaddr, sa); + RT_LOCK(rt0); + RT_REMREF(rt0); +} + +static void +cxgb_redirect_event(void *unused, int event, struct rtentry *rt0, + struct rtentry 
*rt1, struct sockaddr *sa) +{ + /* + * ignore events on non-offloaded interfaces + */ + if (!is_offloading(rt0->rt_ifp)) + return; + + /* + * Cannot redirect to non-offload device. + */ + if (!is_offloading(rt1->rt_ifp)) { + log(LOG_WARNING, "%s: Redirect to non-offload" + "device ignored.\n", __FUNCTION__); + return; + } + + /* + * avoid LORs by dropping the route lock but keeping a reference + * + */ + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + RT_ADDREF(rt1); + RT_UNLOCK(rt1); + + cxgb_redirect(rt0, rt1, sa); + cxgb_neigh_update(rt1, NULL, sa); + + RT_LOCK(rt0); + RT_REMREF(rt0); + RT_LOCK(rt1); + RT_REMREF(rt1); +} + +void +cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa) +{ + + if (rt->rt_ifp && is_offloading(rt->rt_ifp) && (rt->rt_ifp->if_flags & IFCAP_TOE)) { + struct t3cdev *tdev = T3CDEV(rt->rt_ifp); + + PANIC_IF(!tdev); + t3_l2t_update(tdev, rt, enaddr, sa); + } +} + +static void +set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e) +{ + struct mbuf *m; + struct cpl_set_tcb_field *req; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (!m) { + log(LOG_ERR, "%s: cannot allocate mbuf!\n", __FUNCTION__); + return; + } + + m_set_priority(m, CPL_PRIORITY_CONTROL); + req = mtod(m, struct cpl_set_tcb_field *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_L2T_IX); + req->mask = htobe64(V_TCB_L2T_IX(M_TCB_L2T_IX)); + req->val = htobe64(V_TCB_L2T_IX(e->idx)); + tdev->send(tdev, m); +} + +void +cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa) +{ + struct ifnet *olddev, *newdev; + struct tid_info *ti; + struct t3cdev *tdev; + u32 tid; + int update_tcb; + struct l2t_entry *e; + struct toe_tid_entry *te; + + olddev = old->rt_ifp; + newdev = new->rt_ifp; + if (!is_offloading(olddev)) + return; + if (!is_offloading(newdev)) { + log(LOG_WARNING, "%s: Redirect to non-offload" + "device ignored.\n", __FUNCTION__); + return; + } + tdev = T3CDEV(olddev); + PANIC_IF(!tdev); + if (tdev != T3CDEV(newdev)) { + log(LOG_WARNING, "%s: Redirect to different " + "offload device ignored.\n", __FUNCTION__); + return; + } + + /* Add new L2T entry */ + e = t3_l2t_get(tdev, new, new->rt_ifp, sa); + if (!e) { + log(LOG_ERR, "%s: couldn't allocate new l2t entry!\n", + __FUNCTION__); + return; + } + + /* Walk tid table and notify clients of dst change. */ + ti = &(T3C_DATA (tdev))->tid_maps; + for (tid=0; tid < ti->ntids; tid++) { + te = lookup_tid(ti, tid); + PANIC_IF(!te); + if (te->ctx && te->client && te->client->redirect) { + update_tcb = te->client->redirect(te->ctx, old, new, + e); + if (update_tcb) { + l2t_hold(L2DATA(tdev), e); + set_l2t_ix(tdev, tid, e); + } + } + } + l2t_release(L2DATA(tdev), e); +} + +/* + * Initialize the CPL dispatch table. 
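+ * Every opcode initially points at do_bad_cpl(); specific handlers are then
+ * installed with t3tom_register_cpl_handler(), e.g. the listen handlers set
+ * up by t3_init_listen_cpl_handlers():
+ *
+ *	t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+ *	t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);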
+ */ +static void +init_cpl_handlers(void) +{ + int i; + + for (i = 0; i < 256; ++i) + tom_cpl_handlers[i] = do_bad_cpl; + + t3_init_listen_cpl_handlers(); +} + +static int +t3_toe_attach(struct toedev *dev, const struct offload_id *entry) +{ + struct tom_data *t = TOM_DATA(dev); + struct t3cdev *cdev = t->cdev; + struct ddp_params ddp; + struct ofld_page_info rx_page_info; + int err; + + t3_init_tunables(t); + mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF); + CTR2(KTR_TOM, "t3_toe_attach dev=%p entry=%p", dev, entry); + /* Adjust TOE activation for this module */ + t->conf.activated = activated; + + dev->tod_can_offload = can_offload; + dev->tod_connect = t3_connect; + dev->tod_ctl = tom_ctl; +#if 0 + dev->tod_failover = t3_failover; +#endif + err = cdev->ctl(cdev, GET_DDP_PARAMS, &ddp); + if (err) + return err; + + err = cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info); + if (err) + return err; + + t->ddp_llimit = ddp.llimit; + t->ddp_ulimit = ddp.ulimit; + t->pdev = ddp.pdev; + t->rx_page_size = rx_page_info.page_size; + /* OK if this fails, we just can't do DDP */ + t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE; + t->ppod_map = malloc(t->nppods, M_DEVBUF, M_NOWAIT|M_ZERO); + + mtx_init(&t->ppod_map_lock, "ppod map", NULL, MTX_DEF); + + + t3_sysctl_register(cdev->adapter, &t->conf); + return (0); +} + +static void +cxgb_toe_listen_start(void *unused, struct tcpcb *tp) +{ + struct socket *so = inp_inpcbtosocket(tp->t_inpcb); + struct tom_data *p; + + mtx_lock(&cxgb_list_lock); + TAILQ_FOREACH(p, &cxgb_list, entry) { + t3_listen_start(&p->tdev, so, p->cdev); + } + mtx_unlock(&cxgb_list_lock); +} + +static void +cxgb_toe_listen_stop(void *unused, struct tcpcb *tp) +{ + struct socket *so = inp_inpcbtosocket(tp->t_inpcb); + struct tom_data *p; + + mtx_lock(&cxgb_list_lock); + TAILQ_FOREACH(p, &cxgb_list, entry) { + if (tp->t_state == TCPS_LISTEN) + t3_listen_stop(&p->tdev, so, p->cdev); + } + mtx_unlock(&cxgb_list_lock); +} + +static void +cxgb_toe_listen_start_handler(struct inpcb *inp, void *arg) +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp->t_state == TCPS_LISTEN) + cxgb_toe_listen_start(NULL, tp); +} + +static void +cxgb_register_listeners(void) +{ + + inp_apply_all(cxgb_toe_listen_start_handler, NULL); +} + +static int +t3_tom_init(void) +{ + init_cpl_handlers(); + if (t3_init_cpl_io() < 0) { + log(LOG_ERR, + "Unable to initialize cpl io ops\n"); + return -1; + } + t3_init_socket_ops(); + + /* Register with the TOE device layer. 
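+	 * register_tom() hands toecore the t3_tom_info descriptor (its attach
+	 * callback and t3_toe_id_tab ID table) so that t3_toe_attach() runs
+	 * for matching T3 offload devices.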
*/ + + if (register_tom(&t3_tom_info) != 0) { + log(LOG_ERR, + "Unable to register Chelsio T3 TCP offload module.\n"); + return -1; + } + + rw_init(&adapter_list_lock, "ofld adap list"); + TAILQ_INIT(&adapter_list); + EVENTHANDLER_REGISTER(route_arp_update_event, cxgb_arp_update_event, + NULL, EVENTHANDLER_PRI_ANY); + EVENTHANDLER_REGISTER(route_redirect_event, cxgb_redirect_event, + NULL, EVENTHANDLER_PRI_ANY); + + mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF); + listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start, + cxgb_toe_listen_start, NULL, EVENTHANDLER_PRI_ANY); + listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_stop, + cxgb_toe_listen_stop, NULL, EVENTHANDLER_PRI_ANY); + TAILQ_INIT(&cxgb_list); + + + + t3_register_cpl_handler(CPL_PASS_OPEN_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_cr); + t3_register_cpl_handler(CPL_PASS_ESTABLISH, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL_RSS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_URG_NOTIFY, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DATA, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DATA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DMA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3_register_cpl_handler(CPL_PEER_CLOSE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_CLOSE_CON_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req_rss); + t3_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); + t3_register_cpl_handler(CPL_RDMA_TERMINATE, do_term); + t3_register_cpl_handler(CPL_RDMA_EC_STATUS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DATA_DDP, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl); + t3_register_cpl_handler(CPL_GET_TCB_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_SET_TCB_RPL, do_hwtid_rpl); + + /* Register to offloading devices */ + cxgb_register_client(&t3c_tom_client); + + return (0); +} + +static int +t3_tom_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + t3_tom_init(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("uhm, ... unloading isn't really supported for toe\n"); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data= { + "t3_tom", + t3_tom_load, + 0 +}; +MODULE_VERSION(t3_tom, 1); +MODULE_DEPEND(t3_tom, toecore, 1, 1, 1); +MODULE_DEPEND(t3_tom, if_cxgb, 1, 1, 1); +DECLARE_MODULE(t3_tom, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h new file mode 100644 index 0000000000000..bcda2c3c57aaa --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h @@ -0,0 +1,159 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef CXGB_TOM_H_
+#define CXGB_TOM_H_
+#include <sys/protosw.h>
+
+#define LISTEN_INFO_HASH_SIZE 32
+
+struct listen_info {
+ struct listen_info *next; /* Link to next entry */
+ struct socket *so; /* The listening socket */
+ unsigned int stid; /* The server TID */
+};
+
+
+/*
+ * TOM tunable parameters. They can be manipulated through sysctl(2) or /proc.
+ */
+struct tom_tunables {
+ int max_host_sndbuf; // max host RAM consumed by a sndbuf
+ int tx_hold_thres; // push/pull threshold for non-full TX sk_buffs
+ int max_wrs; // max # of outstanding WRs per connection
+ int rx_credit_thres; // min # of RX credits needed for RX_DATA_ACK
+ int cong_alg; // Congestion control algorithm
+ int mss; // max TX_DATA WR payload size
+ int delack; // delayed ACK control
+ int max_conn; // maximum number of offloaded connections
+ int soft_backlog_limit; // whether the listen backlog limit is soft
+ int ddp; // whether to put new connections in DDP mode
+ int ddp_thres; // min recvmsg size before activating DDP
+ int ddp_copy_limit; // capacity of kernel DDP buffer
+ int ddp_push_wait; // whether blocking DDP waits for PSH flag
+ int ddp_rcvcoalesce; // whether receive coalescing is enabled
+ int zcopy_sosend_enabled; // < is never zcopied
+ int zcopy_sosend_partial_thres; // < is never zcopied
+ int zcopy_sosend_partial_copy; // bytes copied in partial zcopy
+ int zcopy_sosend_thres;// >= are mostly zcopied
+ int zcopy_sosend_copy; // bytes copied in zcopied
+ int zcopy_sosend_ret_pending_dma;// pot. return while pending DMA
+ int activated; // TOE engine activation state
+};
+
+struct tom_data {
+ TAILQ_ENTRY(tom_data) entry;
+
+ struct t3cdev *cdev;
+ struct pci_dev *pdev;
+ struct toedev tdev;
+
+ struct cxgb_client *client;
+ struct tom_tunables conf;
+ struct tom_sysctl_table *sysctl;
+
+ /*
+ * The next three locks listen_lock, deferq.lock, and tid_release_lock
+ * are used rarely so we let them potentially share a cacheline.
+ */ + + struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE]; + struct mtx listen_lock; + + struct mbuf_head deferq; + struct task deferq_task; + + struct socket **tid_release_list; + struct mtx tid_release_lock; + struct task tid_release_task; + + volatile int tx_dma_pending; + + unsigned int ddp_llimit; + unsigned int ddp_ulimit; + + unsigned int rx_page_size; + + u8 *ppod_map; + unsigned int nppods; + struct mtx ppod_map_lock; + + struct adap_ports *ports; + struct taskqueue *tq; +}; + + +struct listen_ctx { + struct socket *lso; + struct tom_data *tom_data; + int ulp_mode; + LIST_HEAD(, toepcb) synq_head; + +}; + +#define TOM_DATA(dev) (*(struct tom_data **)&(dev)->tod_l4opt) +#define T3C_DEV(sk) ((TOM_DATA(TOE_DEV(sk)))->cdev) +#define TOEP_T3C_DEV(toep) (TOM_DATA(toep->tp_toedev)->cdev) +#define TOM_TUNABLE(dev, param) (TOM_DATA(dev)->conf.param) + +#define TP_DATASENT (1 << 0) +#define TP_TX_WAIT_IDLE (1 << 1) +#define TP_FIN_SENT (1 << 2) +#define TP_ABORT_RPL_PENDING (1 << 3) +#define TP_ABORT_SHUTDOWN (1 << 4) +#define TP_ABORT_RPL_RCVD (1 << 5) +#define TP_ABORT_REQ_RCVD (1 << 6) +#define TP_CLOSE_CON_REQUESTED (1 << 7) +#define TP_SYN_RCVD (1 << 8) +#define TP_ESTABLISHED (1 << 9) + +void t3_init_tunables(struct tom_data *t); + +void t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p); + +static __inline struct mbuf * +m_gethdr_nofail(int len) +{ + struct mbuf *m; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + panic("implement lowmem cache\n"); + } + + KASSERT(len < MHLEN, ("requested header size too large for mbuf")); + m->m_pkthdr.len = m->m_len = len; + return (m); +} + + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c new file mode 100644 index 0000000000000..1490bfbdc29bd --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c @@ -0,0 +1,119 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> + +static struct tom_tunables default_tunable_vals = { + .max_host_sndbuf = 32 * 1024, + .tx_hold_thres = 0, + .max_wrs = 15, + .rx_credit_thres = 15 * 1024, + .cong_alg = -1, + .mss = 16384, + .delack = 1, + .max_conn = -1, + .soft_backlog_limit = 0, + .ddp = 1, + .ddp_thres = 14 * 4096, + .ddp_copy_limit = 13 * 4096, + .ddp_push_wait = 1, + .ddp_rcvcoalesce = 0, + .zcopy_sosend_enabled = 0, + .zcopy_sosend_partial_thres = 40960, + .zcopy_sosend_partial_copy = 4096 * 3, + .zcopy_sosend_thres = 128 * 1024, + .zcopy_sosend_copy = 4096 * 2, + .zcopy_sosend_ret_pending_dma = 1, + .activated = 1, +}; + +void +t3_init_tunables(struct tom_data *t) +{ + t->conf = default_tunable_vals; + + /* Now apply device specific fixups. */ + t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk; + t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs; +} + +void +t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p) +{ + struct sysctl_ctx_list *ctx; + struct sysctl_oid_list *children; + + ctx = device_get_sysctl_ctx(sc->dev); + children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); + +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c new file mode 100644 index 0000000000000..7036005e93e04 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.c @@ -0,0 +1,180 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + +#define TRACE_ENTER printf("%s:%s entered", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited", __FUNCTION__, __FILE__, __LINE__) + +/* + * This routine takes a user address range and does the following: + * - validate that the user has access to those pages (flags indicates read or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + * - return number of pages in count + */ +int +vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags) +{ + + vm_offset_t end, va; + vm_paddr_t pa; + int faults, rv; + + struct thread *td; + vm_map_t map; + pmap_t pmap; + vm_page_t m, *pages; + vm_prot_t prot; + + + /* + * Check that virtual address range is legal + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + end = addr + (count * PAGE_SIZE); + if (end > VM_MAXUSER_ADDRESS) { + printf("bad address passed\n"); + return (EFAULT); + } + + td = curthread; + map = &td->td_proc->p_vmspace->vm_map; + pmap = &td->td_proc->p_vmspace->vm_pmap; + pages = mp; + + prot = VM_PROT_READ; + prot |= (flags & VM_HOLD_WRITEABLE) ? 
VM_PROT_WRITE : 0;
+ bzero(pages, sizeof(vm_page_t *) * count);
+retry:
+
+ /*
+ * First optimistically assume that all pages are resident (and R/W if for write)
+ * if so just mark pages as held (and dirty if for write) and return
+ */
+ vm_page_lock_queues();
+ for (pages = mp, faults = 0, va = addr; va < end; va += PAGE_SIZE, pages++) {
+ /*
+ * Assure that we only hold the page once
+ */
+ if (*pages == NULL) {
+ /*
+ * page queue mutex is recursable so this is OK
+ * it would be really nice if we had an unlocked version of this so
+ * we were only acquiring the pmap lock 1 time as opposed to potentially
+ * many dozens of times
+ */
+ m = pmap_extract_and_hold(pmap, va, prot);
+ if (m == NULL) {
+ faults++;
+ continue;
+ }
+
+ *pages = m;
+ if (flags & VM_HOLD_WRITEABLE)
+ vm_page_dirty(m);
+ }
+ }
+ vm_page_unlock_queues();
+
+ if (faults == 0) {
+ return (0);
+ }
+
+ /*
+ * Pages either have insufficient permissions or are not present;
+ * trigger a fault where necessary
+ *
+ */
+ for (va = addr; va < end; va += PAGE_SIZE) {
+ m = NULL;
+ pa = pmap_extract(pmap, va);
+ rv = 0;
+ if (pa)
+ m = PHYS_TO_VM_PAGE(pa);
+ if (flags & VM_HOLD_WRITEABLE) {
+ if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
+ rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
+ } else if (m == NULL)
+ rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
+ if (rv) {
+ printf("vm_fault bad return rv=%d va=0x%zx\n", rv, va);
+
+ goto error;
+ }
+ }
+
+ goto retry;
+
+error:
+ vm_page_lock_queues();
+ for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++)
+ if (*pages)
+ vm_page_unhold(*pages);
+ vm_page_unlock_queues();
+ return (EFAULT);
+}
+
+void
+vm_fault_unhold_pages(vm_page_t *mp, int count)
+{
+
+ KASSERT(count >= 0, ("negative count %d", count));
+ vm_page_lock_queues();
+ while (count--) {
+ vm_page_unhold(*mp);
+ mp++;
+ }
+ vm_page_unlock_queues();
+}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h
new file mode 100644
index 0000000000000..29418b616fd4f
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.h
@@ -0,0 +1,40 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+ + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_VM_H_ +#define CXGB_VM_H_ + +#define VM_HOLD_WRITEABLE 0x1 + +int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags); +void vm_fault_unhold_pages(vm_page_t *mp, int count); + +#endif diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile index 6e35a6ee0d86a..85c6f4875d5a1 100644 --- a/sys/modules/cxgb/Makefile +++ b/sys/modules/cxgb/Makefile @@ -1,5 +1,16 @@ # $FreeBSD$ SUBDIR= cxgb +#SUBDIR+= toecore +#SUBDIR+= tom +#SUBDIR+= ${_iw_cxgb} SUBDIR+= cxgb_t3fw +.if ${MACHINE_ARCH} == "i386" +_iw_cxgb = iw_cxgb +.endif + +.if ${MACHINE_ARCH} == "amd64" +_iw_cxgb = iw_cxgb +.endif + .include <bsd.subdir.mk> diff --git a/sys/modules/cxgb/cxgb/Makefile b/sys/modules/cxgb/cxgb/Makefile index 039032da8f6a7..64044e86e4d00 100644 --- a/sys/modules/cxgb/cxgb/Makefile +++ b/sys/modules/cxgb/cxgb/Makefile @@ -4,15 +4,21 @@ CXGB = ${.CURDIR}/../../../dev/cxgb .PATH: ${CXGB} ${CXGB}/common ${CXGB}/sys KMOD= if_cxgb -SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c +SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c -SRCS+= cxgb_sge.c cxgb_lro.c cxgb_offload.c -SRCS+= device_if.h bus_if.h pci_if.h opt_zero.h -SRCS+= uipc_mvec.c cxgb_support.c cxgb_multiq.c - -CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -DDEFAULT_JUMBO -I${CXGB} +SRCS+= cxgb_sge.c cxgb_lro.c cxgb_offload.c cxgb_tn1010.c +SRCS+= device_if.h bus_if.h pci_if.h opt_zero.h opt_sched.h +SRCS+= uipc_mvec.c cxgb_support.c cxgb_multiq.c +CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -DDEFAULT_JUMBO -I${CXGB} -DSMP +CFLAGS+= -DDISABLE_MBUF_IOVEC +#CFLAGS+= -DIFNET_MULTIQUEUE +#CFLAGS+= -DDISABLE_MBUF_IOVEC #CFLAGS+= -DDEBUG -DDEBUG_PRINT +#CFLAGS+= -DINVARIANT_SUPPORT -DINVARIANTS +#CFLAGS+= -DWITNESS +#CFLAGS += -DLOCK_PROFILING +#CFLAGS+= -DWITNESS #CFLAGS+= -DINVARIANT_SUPPORT -DINVARIANTS -DWITNESS -.include <bsd.kmod.mk>
\ No newline at end of file +.include <bsd.kmod.mk> diff --git a/sys/modules/cxgb/cxgb_t3fw/Makefile b/sys/modules/cxgb/cxgb_t3fw/Makefile index 787c9d41fbcc2..c35d73a41f025 100644 --- a/sys/modules/cxgb/cxgb_t3fw/Makefile +++ b/sys/modules/cxgb/cxgb_t3fw/Makefile @@ -3,6 +3,7 @@ CXGB = ${.CURDIR}/../../../dev/cxgb .PATH: ${CXGB} +KMOD= cxgb_t3fw SRCS+= cxgb_t3fw.c .include <bsd.kmod.mk> diff --git a/sys/modules/cxgb/iw_cxgb/Makefile b/sys/modules/cxgb/iw_cxgb/Makefile new file mode 100644 index 0000000000000..e1123bba3faff --- /dev/null +++ b/sys/modules/cxgb/iw_cxgb/Makefile @@ -0,0 +1,14 @@ +# $FreeBSD$ + +CXGB = ${.CURDIR}/../../../dev/cxgb +.PATH: ${IW_CXGB} ${CXGB}/common ${CXGB}/ulp/iw_cxgb + +KMOD= iw_cxgb +SRCS= iw_cxgb.c iw_cxgb_cm.c iw_cxgb_hal.c +SRCS+= iw_cxgb_provider.c iw_cxgb_qp.c iw_cxgb_resource.c +SRCS+= iw_cxgb_ev.c iw_cxgb_mem.c iw_cxgb_dbg.c iw_cxgb_cq.c +SRCS+= bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h opt_ktr.h +CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -I${CXGB} -DSMP +#CFLAGS+= -DDEBUG + +.include <bsd.kmod.mk> diff --git a/sys/modules/cxgb/toecore/Makefile b/sys/modules/cxgb/toecore/Makefile new file mode 100644 index 0000000000000..1c05d799a5f88 --- /dev/null +++ b/sys/modules/cxgb/toecore/Makefile @@ -0,0 +1,8 @@ +# $FreeBSD$ +TOECORE = ${.CURDIR}/../../../dev/cxgb/ulp/toecore +.PATH: ${TOECORE} + +KMOD= toecore +SRCS= toedev.c +SRCS+= device_if.h bus_if.h pci_if.h opt_sched.h +.include <bsd.kmod.mk>
\ No newline at end of file
diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile
new file mode 100644
index 0000000000000..2417edf1fc40e
--- /dev/null
+++ b/sys/modules/cxgb/tom/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+TOM = ${.CURDIR}/../../../dev/cxgb/ulp/tom
+.PATH: ${TOM}
+
+KMOD= tom
+SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c
+SRCS+= cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c
+SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h
+SRCS+= opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h
+SRCS+= device_if.h bus_if.h pci_if.h
+
+#CFLAGS+= -DDEBUG_PRINT -DDEBUG
+.include <bsd.kmod.mk>
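
The new cxgb_vm.c/cxgb_vm.h helpers are not called anywhere else in this changeset, so the intended calling pattern is worth sketching: a consumer (for example the DDP path) wires down the pages backing a user buffer before handing them to the hardware and drops the holds once the I/O completes. The fragment below is a minimal, hypothetical sketch of such a caller; the wrapper names hold_user_buffer()/release_user_buffer(), the page-count arithmetic, and the M_DEVBUF allocation of the page array are illustrative assumptions, not code from this commit.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <dev/cxgb/ulp/tom/cxgb_vm.h>

/*
 * Hypothetical caller: fault in (if needed) and hold every page backing a
 * user buffer so a device can DMA into it, then release the holds when the
 * I/O is done.
 */
static int
hold_user_buffer(vm_offset_t uaddr, size_t len, int writeable,
    vm_page_t **pagesp, int *npagesp)
{
	vm_offset_t start = trunc_page(uaddr);
	int npages = (int)((round_page(uaddr + len) - start) >> PAGE_SHIFT);
	vm_page_t *pages;
	int err;

	pages = malloc(npages * sizeof(vm_page_t), M_DEVBUF,
	    M_NOWAIT | M_ZERO);
	if (pages == NULL)
		return (ENOMEM);

	/* Hold the pages backing [start, start + npages * PAGE_SIZE). */
	err = vm_fault_hold_user_pages(start, pages, npages,
	    writeable ? VM_HOLD_WRITEABLE : 0);
	if (err != 0) {
		free(pages, M_DEVBUF);
		return (err);
	}

	*pagesp = pages;
	*npagesp = npages;
	return (0);
}

static void
release_user_buffer(vm_page_t *pages, int npages)
{
	/* Drop the per-page holds taken above and free the page array. */
	vm_fault_unhold_pages(pages, npages);
	free(pages, M_DEVBUF);
}

Note that vm_fault_hold_user_pages() bzero()s the page array itself before filling it, so the M_ZERO above is redundant but harmless, and the same array and count must later be passed to vm_fault_unhold_pages(), since the holds are taken per page.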