| author | Kip Macy <kmacy@FreeBSD.org> | 2008-07-28 23:37:33 +0000 |
|---|---|---|
| committer | Kip Macy <kmacy@FreeBSD.org> | 2008-07-28 23:37:33 +0000 |
| commit | 6971fe8ddf2f0e170067a422e5f827724410bef9 (patch) | |
| tree | 8fd6cc6e7404202400d3d5f758a8f3b65766b0f4 | |
| parent | 3ccd11b631cb9868dc43b7d5c815100a17bd8d9e (diff) | |
59 files changed, 20553 insertions, 199 deletions
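Much of the PHY rework below (cxgb_ael1002.c and the new cxgb_tn1010.c) switches to table-driven register programming: a zero-terminated array of reg_val entries is walked by the new set_phy_regs() helper, which issues a plain MDIO write when clear_bits is 0xffff and a read-modify-write otherwise. The standalone C sketch below illustrates only that walk; the MDIO accessors are printf stubs standing in for mdio_write()/t3_mdio_change_bits(), and the register addresses and values are placeholders rather than the driver's real AEL2005 tables.

/*
 * Sketch of the zero-terminated register/value table walk used by the
 * new set_phy_regs() helper in cxgb_ael1002.c.  The MDIO accessors here
 * are stubs; in the driver they are mdio_write()/t3_mdio_change_bits().
 */
#include <stdio.h>

struct reg_val {
    unsigned short mmd_addr;    /* MDIO device (MMD) */
    unsigned short reg_addr;    /* register within the MMD */
    unsigned short clear_bits;  /* 0xffff means "overwrite whole register" */
    unsigned short set_bits;
};

static int stub_mdio_write(unsigned int mmd, unsigned int reg, unsigned int val)
{
    printf("write mmd %u reg 0x%04x <- 0x%04x\n", mmd, reg, val);
    return (0);
}

static int stub_mdio_change_bits(unsigned int mmd, unsigned int reg,
    unsigned int clear, unsigned int set)
{
    printf("rmw   mmd %u reg 0x%04x clear 0x%04x set 0x%04x\n",
        mmd, reg, clear, set);
    return (0);
}

static int set_regs(const struct reg_val *rv)
{
    int err = 0;

    /* Walk the table until the all-zero terminator or the first error. */
    for (; rv->mmd_addr && !err; rv++) {
        if (rv->clear_bits == 0xffff)
            err = stub_mdio_write(rv->mmd_addr, rv->reg_addr,
                rv->set_bits);
        else
            err = stub_mdio_change_bits(rv->mmd_addr, rv->reg_addr,
                rv->clear_bits, rv->set_bits);
    }
    return (err);
}

int main(void)
{
    /* Placeholder entries, not the driver's real initialization tables. */
    static const struct reg_val regs[] = {
        { 1, 0xc001, 0,      1 << 5 },  /* set one bit */
        { 1, 0xc013, 0xffff, 0xf341 },  /* full overwrite */
        { 0, 0, 0, 0 }                  /* terminator */
    };

    return (set_regs(regs));
}

In the driver this is the walk that ael2005_reset() applies to its regs0, regs1, and regs2 tables in the hunks that follow.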
diff --git a/sys/conf/files b/sys/conf/files index f71067411abd9..9672a92f10340 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -527,6 +527,7 @@ dev/cxgb/common/cxgb_ael1002.c optional cxgb pci dev/cxgb/common/cxgb_mv88e1xxx.c optional cxgb pci dev/cxgb/common/cxgb_xgmac.c optional cxgb pci dev/cxgb/common/cxgb_t3_hw.c optional cxgb pci +dev/cxgb/common/cxgb_tn1010.c optional cxgb pci dev/cxgb/sys/uipc_mvec.c optional cxgb pci dev/cxgb/sys/cxgb_support.c optional cxgb pci dev/cxgb/cxgb_t3fw.c optional cxgb cxgb_t3fw diff --git a/sys/dev/cxgb/common/cxgb_ael1002.c b/sys/dev/cxgb/common/cxgb_ael1002.c index b288d5d60535a..a9c7fb2d86770 100644 --- a/sys/dev/cxgb/common/cxgb_ael1002.c +++ b/sys/dev/cxgb/common/cxgb_ael1002.c @@ -46,11 +46,32 @@ enum { AEL1002_PWR_DOWN_LO = 0xc012, AEL1002_XFI_EQL = 0xc015, AEL1002_LB_EN = 0xc017, + AEL_OPT_SETTINGS = 0xc017, +}; - LASI_CTRL = 0x9002, - LASI_STAT = 0x9005 +struct reg_val { + unsigned short mmd_addr; + unsigned short reg_addr; + unsigned short clear_bits; + unsigned short set_bits; }; +static int set_phy_regs(struct cphy *phy, const struct reg_val *rv) +{ + int err; + + for (err = 0; rv->mmd_addr && !err; rv++) { + if (rv->clear_bits == 0xffff) + err = mdio_write(phy, rv->mmd_addr, rv->reg_addr, + rv->set_bits); + else + err = t3_mdio_change_bits(phy, rv->mmd_addr, + rv->reg_addr, rv->clear_bits, + rv->set_bits); + } + return err; +} + static void ael100x_txon(struct cphy *phy) { int tx_on_gpio = phy->addr == 0 ? F_GPIO7_OUT_VAL : F_GPIO2_OUT_VAL; @@ -158,33 +179,6 @@ static int ael1006_reset(struct cphy *phy, int wait) return t3_phy_reset(phy, MDIO_DEV_PMA_PMD, wait); } -static int ael1006_intr_enable(struct cphy *phy) -{ - return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 1); -} - -static int ael1006_intr_disable(struct cphy *phy) -{ - return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 0); -} - -static int ael1006_intr_clear(struct cphy *phy) -{ - u32 val; - - return mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &val); -} - -static int ael1006_intr_handler(struct cphy *phy) -{ - unsigned int status; - int err = mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &status); - - if (err) - return err; - return (status & 1) ? 
cphy_cause_link_change : 0; -} - static int ael1006_power_down(struct cphy *phy, int enable) { return t3_mdio_change_bits(phy, MDIO_DEV_PMA_PMD, MII_BMCR, @@ -194,10 +188,10 @@ static int ael1006_power_down(struct cphy *phy, int enable) #ifdef C99_NOT_SUPPORTED static struct cphy_ops ael1006_ops = { ael1006_reset, - ael1006_intr_enable, - ael1006_intr_disable, - ael1006_intr_clear, - ael1006_intr_handler, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, NULL, NULL, NULL, @@ -209,10 +203,10 @@ static struct cphy_ops ael1006_ops = { #else static struct cphy_ops ael1006_ops = { .reset = ael1006_reset, - .intr_enable = ael1006_intr_enable, - .intr_disable = ael1006_intr_disable, - .intr_clear = ael1006_intr_clear, - .intr_handler = ael1006_intr_handler, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, .get_link_status = ael100x_get_link_status, .power_down = ael1006_power_down, }; @@ -228,13 +222,382 @@ int t3_ael1006_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, return 0; } +static int ael2005_setup_sr_edc(struct cphy *phy) +{ + static u16 sr_edc[] = { + 0xcc00, 0x2ff4, + 0xcc01, 0x3cd4, + 0xcc02, 0x2015, + 0xcc03, 0x3105, + 0xcc04, 0x6524, + 0xcc05, 0x27ff, + 0xcc06, 0x300f, + 0xcc07, 0x2c8b, + 0xcc08, 0x300b, + 0xcc09, 0x4009, + 0xcc0a, 0x400e, + 0xcc0b, 0x2f72, + 0xcc0c, 0x3002, + 0xcc0d, 0x1002, + 0xcc0e, 0x2172, + 0xcc0f, 0x3012, + 0xcc10, 0x1002, + 0xcc11, 0x25d2, + 0xcc12, 0x3012, + 0xcc13, 0x1002, + 0xcc14, 0xd01e, + 0xcc15, 0x27d2, + 0xcc16, 0x3012, + 0xcc17, 0x1002, + 0xcc18, 0x2004, + 0xcc19, 0x3c84, + 0xcc1a, 0x6436, + 0xcc1b, 0x2007, + 0xcc1c, 0x3f87, + 0xcc1d, 0x8676, + 0xcc1e, 0x40b7, + 0xcc1f, 0xa746, + 0xcc20, 0x4047, + 0xcc21, 0x5673, + 0xcc22, 0x2982, + 0xcc23, 0x3002, + 0xcc24, 0x13d2, + 0xcc25, 0x8bbd, + 0xcc26, 0x2862, + 0xcc27, 0x3012, + 0xcc28, 0x1002, + 0xcc29, 0x2092, + 0xcc2a, 0x3012, + 0xcc2b, 0x1002, + 0xcc2c, 0x5cc3, + 0xcc2d, 0x314, + 0xcc2e, 0x2942, + 0xcc2f, 0x3002, + 0xcc30, 0x1002, + 0xcc31, 0xd019, + 0xcc32, 0x2032, + 0xcc33, 0x3012, + 0xcc34, 0x1002, + 0xcc35, 0x2a04, + 0xcc36, 0x3c74, + 0xcc37, 0x6435, + 0xcc38, 0x2fa4, + 0xcc39, 0x3cd4, + 0xcc3a, 0x6624, + 0xcc3b, 0x5563, + 0xcc3c, 0x2d42, + 0xcc3d, 0x3002, + 0xcc3e, 0x13d2, + 0xcc3f, 0x464d, + 0xcc40, 0x2862, + 0xcc41, 0x3012, + 0xcc42, 0x1002, + 0xcc43, 0x2032, + 0xcc44, 0x3012, + 0xcc45, 0x1002, + 0xcc46, 0x2fb4, + 0xcc47, 0x3cd4, + 0xcc48, 0x6624, + 0xcc49, 0x5563, + 0xcc4a, 0x2d42, + 0xcc4b, 0x3002, + 0xcc4c, 0x13d2, + 0xcc4d, 0x2ed2, + 0xcc4e, 0x3002, + 0xcc4f, 0x1002, + 0xcc50, 0x2fd2, + 0xcc51, 0x3002, + 0xcc52, 0x1002, + 0xcc53, 0x004, + 0xcc54, 0x2942, + 0xcc55, 0x3002, + 0xcc56, 0x1002, + 0xcc57, 0x2092, + 0xcc58, 0x3012, + 0xcc59, 0x1002, + 0xcc5a, 0x5cc3, + 0xcc5b, 0x317, + 0xcc5c, 0x2f72, + 0xcc5d, 0x3002, + 0xcc5e, 0x1002, + 0xcc5f, 0x2942, + 0xcc60, 0x3002, + 0xcc61, 0x1002, + 0xcc62, 0x22cd, + 0xcc63, 0x301d, + 0xcc64, 0x2862, + 0xcc65, 0x3012, + 0xcc66, 0x1002, + 0xcc67, 0x2ed2, + 0xcc68, 0x3002, + 0xcc69, 0x1002, + 0xcc6a, 0x2d72, + 0xcc6b, 0x3002, + 0xcc6c, 0x1002, + 0xcc6d, 0x628f, + 0xcc6e, 0x2112, + 0xcc6f, 0x3012, + 0xcc70, 0x1002, + 0xcc71, 0x5aa3, + 0xcc72, 0x2dc2, + 0xcc73, 0x3002, + 0xcc74, 0x1312, + 0xcc75, 0x6f72, + 0xcc76, 0x1002, + 0xcc77, 0x2807, + 0xcc78, 0x31a7, + 0xcc79, 0x20c4, + 0xcc7a, 0x3c24, + 0xcc7b, 0x6724, + 0xcc7c, 0x1002, + 0xcc7d, 0x2807, + 0xcc7e, 0x3187, + 0xcc7f, 
0x20c4, + 0xcc80, 0x3c24, + 0xcc81, 0x6724, + 0xcc82, 0x1002, + 0xcc83, 0x2514, + 0xcc84, 0x3c64, + 0xcc85, 0x6436, + 0xcc86, 0xdff4, + 0xcc87, 0x6436, + 0xcc88, 0x1002, + 0xcc89, 0x40a4, + 0xcc8a, 0x643c, + 0xcc8b, 0x4016, + 0xcc8c, 0x8c6c, + 0xcc8d, 0x2b24, + 0xcc8e, 0x3c24, + 0xcc8f, 0x6435, + 0xcc90, 0x1002, + 0xcc91, 0x2b24, + 0xcc92, 0x3c24, + 0xcc93, 0x643a, + 0xcc94, 0x4025, + 0xcc95, 0x8a5a, + 0xcc96, 0x1002, + 0xcc97, 0x2731, + 0xcc98, 0x3011, + 0xcc99, 0x1001, + 0xcc9a, 0xc7a0, + 0xcc9b, 0x100, + 0xcc9c, 0xc502, + 0xcc9d, 0x53ac, + 0xcc9e, 0xc503, + 0xcc9f, 0xd5d5, + 0xcca0, 0xc600, + 0xcca1, 0x2a6d, + 0xcca2, 0xc601, + 0xcca3, 0x2a4c, + 0xcca4, 0xc602, + 0xcca5, 0x111, + 0xcca6, 0xc60c, + 0xcca7, 0x5900, + 0xcca8, 0xc710, + 0xcca9, 0x700, + 0xccaa, 0xc718, + 0xccab, 0x700, + 0xccac, 0xc720, + 0xccad, 0x4700, + 0xccae, 0xc801, + 0xccaf, 0x7f50, + 0xccb0, 0xc802, + 0xccb1, 0x7760, + 0xccb2, 0xc803, + 0xccb3, 0x7fce, + 0xccb4, 0xc804, + 0xccb5, 0x5700, + 0xccb6, 0xc805, + 0xccb7, 0x5f11, + 0xccb8, 0xc806, + 0xccb9, 0x4751, + 0xccba, 0xc807, + 0xccbb, 0x57e1, + 0xccbc, 0xc808, + 0xccbd, 0x2700, + 0xccbe, 0xc809, + 0xccbf, 0x000, + 0xccc0, 0xc821, + 0xccc1, 0x002, + 0xccc2, 0xc822, + 0xccc3, 0x014, + 0xccc4, 0xc832, + 0xccc5, 0x1186, + 0xccc6, 0xc847, + 0xccc7, 0x1e02, + 0xccc8, 0xc013, + 0xccc9, 0xf341, + 0xccca, 0xc01a, + 0xcccb, 0x446, + 0xcccc, 0xc024, + 0xcccd, 0x1000, + 0xccce, 0xc025, + 0xcccf, 0xa00, + 0xccd0, 0xc026, + 0xccd1, 0xc0c, + 0xccd2, 0xc027, + 0xccd3, 0xc0c, + 0xccd4, 0xc029, + 0xccd5, 0x0a0, + 0xccd6, 0xc030, + 0xccd7, 0xa00, + 0xccd8, 0xc03c, + 0xccd9, 0x01c, + 0xccda, 0xc005, + 0xccdb, 0x7a06, + 0xccdc, 0x000, + 0xccdd, 0x2731, + 0xccde, 0x3011, + 0xccdf, 0x1001, + 0xcce0, 0xc620, + 0xcce1, 0x000, + 0xcce2, 0xc621, + 0xcce3, 0x03f, + 0xcce4, 0xc622, + 0xcce5, 0x000, + 0xcce6, 0xc623, + 0xcce7, 0x000, + 0xcce8, 0xc624, + 0xcce9, 0x000, + 0xccea, 0xc625, + 0xcceb, 0x000, + 0xccec, 0xc627, + 0xcced, 0x000, + 0xccee, 0xc628, + 0xccef, 0x000, + 0xccf0, 0xc62c, + 0xccf1, 0x000, + 0xccf2, 0x000, + 0xccf3, 0x2806, + 0xccf4, 0x3cb6, + 0xccf5, 0xc161, + 0xccf6, 0x6134, + 0xccf7, 0x6135, + 0xccf8, 0x5443, + 0xccf9, 0x303, + 0xccfa, 0x6524, + 0xccfb, 0x00b, + 0xccfc, 0x1002, + 0xccfd, 0x2104, + 0xccfe, 0x3c24, + 0xccff, 0x2105, + 0xcd00, 0x3805, + 0xcd01, 0x6524, + 0xcd02, 0xdff4, + 0xcd03, 0x4005, + 0xcd04, 0x6524, + 0xcd05, 0x1002, + 0xcd06, 0x5dd3, + 0xcd07, 0x306, + 0xcd08, 0x2ff7, + 0xcd09, 0x38f7, + 0xcd0a, 0x60b7, + 0xcd0b, 0xdffd, + 0xcd0c, 0x00a, + 0xcd0d, 0x1002, + 0xcd0e, 0 + }; + int i, err; + + for (err = i = 0; i < ARRAY_SIZE(sr_edc) && !err; i += 2) + err = mdio_write(phy, MDIO_DEV_PMA_PMD, sr_edc[i], + sr_edc[i + 1]); + return err; +} + +static int ael2005_reset(struct cphy *phy, int wait) +{ + static struct reg_val regs0[] = { + { MDIO_DEV_PMA_PMD, 0xc001, 0, 1 << 5 }, + { MDIO_DEV_PMA_PMD, 0xc017, 0, 1 << 5 }, + { MDIO_DEV_PMA_PMD, 0xc013, 0xffff, 0xf341 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0x8100 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0x8000 }, + { MDIO_DEV_PMA_PMD, 0xc210, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + static struct reg_val regs1[] = { + { MDIO_DEV_PMA_PMD, 0xc003, 0xffff, 0x181 }, + { MDIO_DEV_PMA_PMD, 0xc010, 0xffff, 0x448a }, + { MDIO_DEV_PMA_PMD, 0xc04a, 0xffff, 0x5200 }, + { 0, 0, 0, 0 } + }; + static struct reg_val regs2[] = { + { MDIO_DEV_PMA_PMD, 0xca00, 0xffff, 0x0080 }, + { MDIO_DEV_PMA_PMD, 0xca12, 0xffff, 0 }, + { 0, 0, 0, 0 } + }; + + int err; + + err = t3_phy_reset(phy, 
MDIO_DEV_PMA_PMD, 0); + if (err) + return err; + + msleep(125); + err = set_phy_regs(phy, regs0); + if (err) + return err; + + msleep(50); + err = set_phy_regs(phy, regs1); + if (err) + return err; + + msleep(50); + err = ael2005_setup_sr_edc(phy); + if (err) + return err; + + return set_phy_regs(phy, regs2); +} + +#ifdef C99_NOT_SUPPORTED +static struct cphy_ops ael2005_ops = { + ael2005_reset, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, + NULL, + NULL, + NULL, + NULL, + NULL, + ael100x_get_link_status, + ael1002_power_down, +}; +#else +static struct cphy_ops ael2005_ops = { + .reset = ael2005_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .get_link_status = ael100x_get_link_status, + .power_down = ael1002_power_down, +}; +#endif + +int t3_ael2005_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &ael2005_ops, mdio_ops, + SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE, + "10GBASE-R"); + msleep(125); + return t3_mdio_change_bits(phy, MDIO_DEV_PMA_PMD, AEL_OPT_SETTINGS, 0, + 1 << 5); +} + #ifdef C99_NOT_SUPPORTED static struct cphy_ops qt2045_ops = { ael1006_reset, - ael1006_intr_enable, - ael1006_intr_disable, - ael1006_intr_clear, - ael1006_intr_handler, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, NULL, NULL, NULL, @@ -246,10 +609,10 @@ static struct cphy_ops qt2045_ops = { #else static struct cphy_ops qt2045_ops = { .reset = ael1006_reset, - .intr_enable = ael1006_intr_enable, - .intr_disable = ael1006_intr_disable, - .intr_clear = ael1006_intr_clear, - .intr_handler = ael1006_intr_handler, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, .get_link_status = ael100x_get_link_status, .power_down = ael1006_power_down, }; diff --git a/sys/dev/cxgb/common/cxgb_common.h b/sys/dev/cxgb/common/cxgb_common.h index 1ce6b4016b493..9ac28945533a2 100644 --- a/sys/dev/cxgb/common/cxgb_common.h +++ b/sys/dev/cxgb/common/cxgb_common.h @@ -47,10 +47,7 @@ enum { NCCTRL_WIN = 32, /* # of congestion control windows */ NTX_SCHED = 8, /* # of HW Tx scheduling queues */ PROTO_SRAM_LINES = 128, /* size of protocol sram */ - MAX_NPORTS = 4, - TP_TMR_RES = 200, - TP_SRAM_OFFSET = 4096, /* TP SRAM content offset in eeprom */ - TP_SRAM_LEN = 2112, /* TP SRAM content offset in eeprom */ + EXACT_ADDR_FILTERS = 8, /* # of HW exact match filters */ }; #define MAX_RX_COALESCING_LEN 12288U @@ -122,8 +119,8 @@ enum { }; struct sg_ent { /* SGE scatter/gather entry */ - u32 len[2]; - u64 addr[2]; + __be32 len[2]; + __be64 addr[2]; }; #ifndef SGE_NUM_GENBITS @@ -150,7 +147,7 @@ struct adapter_info { unsigned char mdien:1; unsigned char mdiinv:1; unsigned int gpio_out; /* GPIO output settings */ - unsigned int gpio_intr; /* GPIO IRQ enable mask */ + unsigned char gpio_intr[MAX_NPORTS]; /* GPIO PHY IRQ pins */ unsigned long caps; /* adapter capabilities */ const struct mdio_ops *mdio_ops; /* MDIO operations */ const char *desc; /* product description */ @@ -159,8 +156,6 @@ struct adapter_info { struct port_type_info { int (*phy_prep)(struct cphy *phy, adapter_t *adapter, int phy_addr, const struct mdio_ops *ops); - - }; struct mc5_stats { @@ -307,7 +302,7 
@@ struct tp_params { struct qset_params { /* SGE queue set parameters */ unsigned int polling; /* polling/interrupt service for rspq */ unsigned int lro; /* large receive offload */ - unsigned int coalesce_nsecs; /* irq coalescing timer */ + unsigned int coalesce_usecs; /* irq coalescing timer */ unsigned int rspq_size; /* # of entries in response queue */ unsigned int fl_size; /* # of entries in regular free list */ unsigned int jumbo_size; /* # of entries in jumbo free list */ @@ -486,12 +481,25 @@ enum { MAC_RXFIFO_SIZE = 32768 }; -/* IEEE 802.3ae specified MDIO devices */ +/* IEEE 802.3 specified MDIO devices */ enum { MDIO_DEV_PMA_PMD = 1, MDIO_DEV_WIS = 2, MDIO_DEV_PCS = 3, - MDIO_DEV_XGXS = 4 + MDIO_DEV_XGXS = 4, + MDIO_DEV_ANEG = 7, + MDIO_DEV_VEND1 = 30, + MDIO_DEV_VEND2 = 31 +}; + +/* LASI control and status registers */ +enum { + RX_ALARM_CTRL = 0x9000, + TX_ALARM_CTRL = 0x9001, + LASI_CTRL = 0x9002, + RX_ALARM_STAT = 0x9003, + TX_ALARM_STAT = 0x9004, + LASI_STAT = 0x9005 }; /* PHY loopback direction */ @@ -556,8 +564,8 @@ static inline int mdio_write(struct cphy *phy, int mmd, int reg, /* Convenience initializer */ static inline void cphy_init(struct cphy *phy, adapter_t *adapter, int phy_addr, struct cphy_ops *phy_ops, - const struct mdio_ops *mdio_ops, unsigned int caps, - const char *desc) + const struct mdio_ops *mdio_ops, unsigned int caps, + const char *desc) { phy->adapter = adapter; phy->addr = phy_addr; @@ -651,7 +659,12 @@ int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear, unsigned int set); int t3_phy_reset(struct cphy *phy, int mmd, int wait); int t3_phy_advertise(struct cphy *phy, unsigned int advert); +int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert); int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex); +int t3_phy_lasi_intr_enable(struct cphy *phy); +int t3_phy_lasi_intr_disable(struct cphy *phy); +int t3_phy_lasi_intr_clear(struct cphy *phy); +int t3_phy_lasi_intr_handler(struct cphy *phy); void t3_intr_enable(adapter_t *adapter); void t3_intr_disable(adapter_t *adapter); @@ -673,10 +686,10 @@ int t3_read_flash(adapter_t *adapter, unsigned int addr, unsigned int nwords, int t3_get_tp_version(adapter_t *adapter, u32 *vers); int t3_check_tpsram_version(adapter_t *adapter, int *must_load); int t3_check_tpsram(adapter_t *adapter, const u8 *tp_ram, unsigned int size); -int t3_load_fw(adapter_t *adapter, const const u8 *fw_data, unsigned int size); -int t3_load_boot(adapter_t *adapter, u8 *boot_data, unsigned int size); +int t3_load_fw(adapter_t *adapter, const u8 *fw_data, unsigned int size); int t3_get_fw_version(adapter_t *adapter, u32 *vers); int t3_check_fw_version(adapter_t *adapter, int *must_load); +int t3_load_boot(adapter_t *adapter, u8 *fw_data, unsigned int size); int t3_init_hw(adapter_t *adapter, u32 fw_params); void mac_prep(struct cmac *mac, adapter_t *adapter, int index); void early_hw_init(adapter_t *adapter, const struct adapter_info *ai); @@ -684,8 +697,8 @@ int t3_prep_adapter(adapter_t *adapter, const struct adapter_info *ai, int reset void t3_led_ready(adapter_t *adapter); void t3_fatal_err(adapter_t *adapter); void t3_set_vlan_accel(adapter_t *adapter, unsigned int ports, int on); -void t3_tp_set_offload_mode(adapter_t *adap, int enable); void t3_enable_filters(adapter_t *adap); +void t3_tp_set_offload_mode(adapter_t *adap, int enable); void t3_config_rss(adapter_t *adapter, unsigned int rss_config, const u8 *cpus, const u16 *rspq); int t3_read_rss(adapter_t *adapter, u8 *lkup, u16 
*map); @@ -719,7 +732,7 @@ void t3_mc5_intr_handler(struct mc5 *mc5); int t3_read_mc5_range(const struct mc5 *mc5, unsigned int start, unsigned int n, u32 *buf); -#if defined(CONFIG_CHELSIO_T3_CORE) +#ifdef CONFIG_CHELSIO_T3_CORE int t3_tp_set_coalescing_size(adapter_t *adap, unsigned int size, int psh); void t3_tp_set_max_rxsize(adapter_t *adap, unsigned int size); void t3_tp_get_mib_stats(adapter_t *adap, struct tp_mib_stats *tps); @@ -774,21 +787,22 @@ int t3_vsc7323_set_mtu(adapter_t *adap, unsigned int mtu, int port); int t3_vsc7323_set_addr(adapter_t *adap, u8 addr[6], int port); int t3_vsc7323_enable(adapter_t *adap, int port, int which); int t3_vsc7323_disable(adapter_t *adap, int port, int which); - -int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert); - const struct mac_stats *t3_vsc7323_update_stats(struct cmac *mac); int t3_mv88e1xxx_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); int t3_vsc8211_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); int t3_ael1002_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); int t3_ael1006_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); -int t3_qt2045_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, const struct mdio_ops *mdio_ops); +int t3_ael2005_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); +int t3_qt2045_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); +int t3_tn1010_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops); int t3_xaui_direct_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops); + const struct mdio_ops *mdio_ops); #endif /* __CHELSIO_COMMON_H */ diff --git a/sys/dev/cxgb/common/cxgb_mc5.c b/sys/dev/cxgb/common/cxgb_mc5.c index 0e40aca8880e0..6f1537c680843 100644 --- a/sys/dev/cxgb/common/cxgb_mc5.c +++ b/sys/dev/cxgb/common/cxgb_mc5.c @@ -326,9 +326,16 @@ static void mc5_dbgi_mode_disable(const struct mc5 *mc5) V_PRTYEN(mc5->parity_enabled) | F_MBUSEN); } -/* - * Initialization that requires the OS and protocol layers to already - * be intialized goes here. +/** + * t3_mc5_init - initialize MC5 and the TCAM + * @mc5: the MC5 handle + * @nservers: desired number the TCP servers (listening ports) + * @nfilters: desired number of HW filters (classifiers) + * @nroutes: desired number of routes + * + * Initialize MC5 and the TCAM and partition the TCAM for the requested + * number of servers, filters, and routes. The number of routes is + * typically 0 except for specialized uses of the T3 adapters. 
*/ int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, unsigned int nroutes) @@ -344,7 +351,7 @@ int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters, if (nroutes > MAX_ROUTES || nroutes + nservers + nfilters > tcam_size) return -EINVAL; - if (nfilters && adap->params.rev < T3_REV_C) + if (nfilters) mc5->parity_enabled = 0; /* Reset the TCAM */ @@ -420,7 +427,7 @@ int t3_read_mc5_range(const struct mc5 *mc5, unsigned int start, } mc5_dbgi_mode_disable(mc5); - return 0; + return err; } #define MC5_INT_FATAL (F_PARITYERR | F_REQQPARERR | F_DISPQPARERR) @@ -465,7 +472,6 @@ void t3_mc5_intr_handler(struct mc5 *mc5) t3_write_reg(adap, A_MC5_DB_INT_CAUSE, cause); } - /** * t3_mc5_prep - initialize the SW state for MC5 * @adapter: the adapter diff --git a/sys/dev/cxgb/common/cxgb_mv88e1xxx.c b/sys/dev/cxgb/common/cxgb_mv88e1xxx.c index 8777b82b2f05f..ab8cce7fdc043 100644 --- a/sys/dev/cxgb/common/cxgb_mv88e1xxx.c +++ b/sys/dev/cxgb/common/cxgb_mv88e1xxx.c @@ -299,7 +299,7 @@ static struct cphy_ops mv88e1xxx_ops = { #endif int t3_mv88e1xxx_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, - const struct mdio_ops *mdio_ops) + const struct mdio_ops *mdio_ops) { int err; @@ -310,9 +310,9 @@ int t3_mv88e1xxx_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, /* Configure copper PHY transmitter as class A to reduce EMI. */ err = mdio_write(phy, 0, MV88E1XXX_EXTENDED_ADDR, 0xb); - if (!err) err = mdio_write(phy, 0, MV88E1XXX_EXTENDED_DATA, 0x8004); + if (!err) err = mv88e1xxx_downshift_set(phy, 1); /* Enable downshift */ return err; diff --git a/sys/dev/cxgb/common/cxgb_t3_cpl.h b/sys/dev/cxgb/common/cxgb_t3_cpl.h index dd245712cd653..7cd219d222579 100644 --- a/sys/dev/cxgb/common/cxgb_t3_cpl.h +++ b/sys/dev/cxgb/common/cxgb_t3_cpl.h @@ -103,6 +103,7 @@ enum CPL_opcode { CPL_RDMA_TERMINATE = 0xA2, CPL_TRACE_PKT = 0xA3, CPL_RDMA_EC_STATUS = 0xA5, + CPL_SGE_EC_CR_RETURN = 0xA6, NUM_CPL_CMDS /* must be last and previous entries must be sorted */ }; @@ -148,7 +149,8 @@ enum { enum { CPL_PASS_OPEN_ACCEPT, - CPL_PASS_OPEN_REJECT + CPL_PASS_OPEN_REJECT, + CPL_PASS_OPEN_ACCEPT_TNL }; enum { @@ -907,6 +909,14 @@ struct cpl_wr_ack { __be32 snd_una; }; +struct cpl_sge_ec_cr_return { + RSS_HDR + union opcode_tid ot; + __be16 sge_ec_id; + __u8 cr; + __u8 rsvd; +}; + struct cpl_rdma_ec_status { RSS_HDR union opcode_tid ot; @@ -959,9 +969,11 @@ struct cpl_rx_data { __u8 dack_mode:2; __u8 psh:1; __u8 heartbeat:1; - __u8 :4; + __u8 ddp_off:1; + __u8 :3; #else - __u8 :4; + __u8 :3; + __u8 ddp_off:1; __u8 heartbeat:1; __u8 psh:1; __u8 dack_mode:2; @@ -1129,6 +1141,17 @@ struct cpl_tx_pkt { __be32 len; }; +struct cpl_tx_pkt_coalesce { + __be32 cntrl; + __be32 len; + __be64 addr; +}; + +struct tx_pkt_coalesce_wr { + WR_HDR; + struct cpl_tx_pkt_coalesce cpl[0]; +}; + struct cpl_tx_pkt_lso { WR_HDR; __be32 cntrl; @@ -1265,7 +1288,8 @@ struct cpl_l2t_write_req { WR_HDR; union opcode_tid ot; __be32 params; - __u8 rsvd[2]; + __u8 rsvd; + __u8 port_idx; __u8 dst_mac[6]; }; diff --git a/sys/dev/cxgb/common/cxgb_t3_hw.c b/sys/dev/cxgb/common/cxgb_t3_hw.c index 29fc328223d6e..acd41c034c571 100644 --- a/sys/dev/cxgb/common/cxgb_t3_hw.c +++ b/sys/dev/cxgb/common/cxgb_t3_hw.c @@ -460,32 +460,57 @@ int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex) return mdio_write(phy, 0, MII_BMCR, ctl); } +int t3_phy_lasi_intr_enable(struct cphy *phy) +{ + return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 1); +} + +int t3_phy_lasi_intr_disable(struct cphy 
*phy) +{ + return mdio_write(phy, MDIO_DEV_PMA_PMD, LASI_CTRL, 0); +} + +int t3_phy_lasi_intr_clear(struct cphy *phy) +{ + u32 val; + + return mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &val); +} + +int t3_phy_lasi_intr_handler(struct cphy *phy) +{ + unsigned int status; + int err = mdio_read(phy, MDIO_DEV_PMA_PMD, LASI_STAT, &status); + + if (err) + return err; + return (status & 1) ? cphy_cause_link_change : 0; +} + static struct adapter_info t3_adap_info[] = { { 1, 1, 0, 0, 0, F_GPIO2_OEN | F_GPIO4_OEN | - F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, F_GPIO3 | F_GPIO5, - 0, + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, &mi1_mdio_ops, "Chelsio PE9000" }, { 1, 1, 0, 0, 0, F_GPIO2_OEN | F_GPIO4_OEN | - F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, F_GPIO3 | F_GPIO5, - 0, + F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0, &mi1_mdio_ops, "Chelsio T302" }, { 1, 0, 0, 0, 0, F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO10_OEN | F_GPIO11_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, - 0, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + { 0 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, &mi1_mdio_ext_ops, "Chelsio T310" }, { 1, 1, 0, 0, 0, F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO5_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO10_OEN | F_GPIO11_OEN | F_GPIO1_OUT_VAL | - F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, 0, - SUPPORTED_10000baseT_Full | SUPPORTED_AUI, + F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL, + { S_GPIO9, S_GPIO3 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI, &mi1_mdio_ext_ops, "Chelsio T320" }, { 4, 0, 0, 0, 0, F_GPIO5_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO7_OUT_VAL, - F_GPIO1 | F_GPIO2 | F_GPIO3 | F_GPIO4, SUPPORTED_AUI, + { S_GPIO1, S_GPIO2, S_GPIO3, S_GPIO4 }, SUPPORTED_AUI, &mi1_mdio_ops, "Chelsio T304" }, }; @@ -504,10 +529,10 @@ static struct port_type_info port_types[] = { { t3_vsc8211_phy_prep }, { t3_mv88e1xxx_phy_prep }, { t3_xaui_direct_phy_prep }, - { NULL }, + { t3_ael2005_phy_prep }, { t3_qt2045_phy_prep }, { t3_ael1006_phy_prep }, - { NULL }, + { t3_tn1010_phy_prep }, }; #define VPD_ENTRY(name, len) \ @@ -1231,6 +1256,15 @@ void t3_link_changed(adapter_t *adapter, int port_id) phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc); + if (lc->requested_fc & PAUSE_AUTONEG) + fc &= lc->requested_fc; + else + fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); + + if (link_ok == lc->link_ok && speed == lc->speed && + duplex == lc->duplex && fc == lc->fc) + return; /* nothing changed */ + if (link_ok != lc->link_ok && adapter->params.rev > 0 && uses_xaui(adapter)) { if (link_ok) @@ -1241,10 +1275,6 @@ void t3_link_changed(adapter_t *adapter, int port_id) lc->link_ok = (unsigned char)link_ok; lc->speed = speed < 0 ? SPEED_INVALID : speed; lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex; - if (lc->requested_fc & PAUSE_AUTONEG) - fc &= lc->requested_fc; - else - fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX); if (link_ok && speed >= 0 && lc->autoneg == AUTONEG_ENABLE) { /* Set MAC speed, duplex, and flow control to match PHY. 
*/ @@ -1784,19 +1814,15 @@ static int mac_intr_handler(adapter_t *adap, unsigned int idx) */ int t3_phy_intr_handler(adapter_t *adapter) { - u32 mask, gpi = adapter_info(adapter)->gpio_intr; u32 i, cause = t3_read_reg(adapter, A_T3DBG_INT_CAUSE); for_each_port(adapter, i) { struct port_info *p = adap2pinfo(adapter, i); - mask = gpi - (gpi & (gpi - 1)); - gpi -= mask; - if (!(p->phy.caps & SUPPORTED_IRQ)) continue; - if (cause & mask) { + if (cause & (1 << adapter_info(adapter)->gpio_intr[i])) { int phy_cause = p->phy.ops->intr_handler(&p->phy); if (phy_cause & cphy_cause_link_change) @@ -1870,6 +1896,17 @@ int t3_slow_intr_handler(adapter_t *adapter) return 1; } +static unsigned int calc_gpio_intr(adapter_t *adap) +{ + unsigned int i, gpi_intr = 0; + + for_each_port(adap, i) + if ((adap2pinfo(adap, i)->phy.caps & SUPPORTED_IRQ) && + adapter_info(adap)->gpio_intr[i]) + gpi_intr |= 1 << adapter_info(adap)->gpio_intr[i]; + return gpi_intr; +} + /** * t3_intr_enable - enable interrupts * @adapter: the adapter whose interrupts should be enabled @@ -1912,10 +1949,8 @@ void t3_intr_enable(adapter_t *adapter) t3_write_reg(adapter, A_ULPTX_INT_ENABLE, ULPTX_INTR_MASK); } - t3_write_reg(adapter, A_T3DBG_GPIO_ACT_LOW, - adapter_info(adapter)->gpio_intr); - t3_write_reg(adapter, A_T3DBG_INT_ENABLE, - adapter_info(adapter)->gpio_intr); + t3_write_reg(adapter, A_T3DBG_INT_ENABLE, calc_gpio_intr(adapter)); + if (is_pcie(adapter)) t3_write_reg(adapter, A_PCIE_INT_ENABLE, PCIE_INTR_MASK); else @@ -2561,6 +2596,20 @@ static void tp_wr_bits_indirect(adapter_t *adap, unsigned int addr, } /** + * t3_enable_filters - enable the HW filters + * @adap: the adapter + * + * Enables the HW filters for NIC traffic. + */ +void t3_enable_filters(adapter_t *adap) +{ + t3_set_reg_field(adap, A_TP_IN_CONFIG, F_NICMODE, 0); + t3_set_reg_field(adap, A_MC5_DB_CONFIG, 0, F_FILTEREN); + t3_set_reg_field(adap, A_TP_GLOBAL_CONFIG, 0, V_FIVETUPLELOOKUP(3)); + tp_wr_bits_indirect(adap, A_TP_INGRESS_CONFIG, 0, F_LOOKUPEVERYPKT); +} + +/** * pm_num_pages - calculate the number of pages of the payload memory * @mem_size: the size of the payload memory * @pg_size: the size of each payload memory page @@ -2660,10 +2709,10 @@ static void tp_config(adapter_t *adap, const struct tp_params *p) F_TCPCHECKSUMOFFLOAD | V_IPTTL(64)); t3_write_reg(adap, A_TP_TCP_OPTIONS, V_MTUDEFAULT(576) | F_MTUENABLE | V_WINDOWSCALEMODE(1) | - V_TIMESTAMPSMODE(0) | V_SACKMODE(1) | V_SACKRX(1)); + V_TIMESTAMPSMODE(1) | V_SACKMODE(1) | V_SACKRX(1)); t3_write_reg(adap, A_TP_DACK_CONFIG, V_AUTOSTATE3(1) | V_AUTOSTATE2(1) | V_AUTOSTATE1(0) | - V_BYTETHRESHOLD(16384) | V_MSSTHRESHOLD(2) | + V_BYTETHRESHOLD(26880) | V_MSSTHRESHOLD(2) | F_AUTOCAREFUL | F_AUTOENABLE | V_DACK_MODE(1)); t3_set_reg_field(adap, A_TP_IN_CONFIG, F_RXFBARBPRIO | F_TXFBARBPRIO, F_IPV6ENABLE | F_NICMODE); @@ -2705,7 +2754,8 @@ static void tp_config(adapter_t *adap, const struct tp_params *p) if (adap->params.nports > 2) { t3_set_reg_field(adap, A_TP_PC_CONFIG2, 0, - F_ENABLETXPORTFROMDA | F_ENABLERXPORTFROMADDR); + F_ENABLETXPORTFROMDA2 | F_ENABLETXPORTFROMDA | + F_ENABLERXPORTFROMADDR); tp_wr_bits_indirect(adap, A_TP_QOS_RX_MAP_MODE, V_RXMAPMODE(M_RXMAPMODE), 0); tp_wr_indirect(adap, A_TP_INGRESS_CONFIG, V_BITPOS0(48) | @@ -3620,6 +3670,8 @@ int t3_init_hw(adapter_t *adapter, u32 fw_params) chan_init_hw(adapter, adapter->params.chan_map); t3_sge_init(adapter, &adapter->params.sge); + t3_write_reg(adapter, A_T3DBG_GPIO_ACT_LOW, calc_gpio_intr(adapter)); + t3_write_reg(adapter, 
A_CIM_HOST_ACC_DATA, vpd->uclk | fw_params); t3_write_reg(adapter, A_CIM_BOOT_CFG, V_BOOTADDR(FW_FLASH_BOOT_ADDR >> 2)); diff --git a/sys/dev/cxgb/common/cxgb_tn1010.c b/sys/dev/cxgb/common/cxgb_tn1010.c new file mode 100644 index 0000000000000..920ccc04a8665 --- /dev/null +++ b/sys/dev/cxgb/common/cxgb_tn1010.c @@ -0,0 +1,225 @@ +/************************************************************************** + +Copyright (c) 2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + +#undef msleep +#define msleep t3_os_sleep + +/* TN1010 PHY specific registers. */ +enum { + TN1010_VEND1_STAT = 1, +}; + +/* IEEE auto-negotiation 10GBASE-T registers */ +enum { + ANEG_ADVER = 16, + ANEG_LPA = 19, + ANEG_10G_CTRL = 32, + ANEG_10G_STAT = 33 +}; + +#define ADVERTISE_ENPAGE (1 << 12) +#define ADVERTISE_10000FULL (1 << 12) +#define ADVERTISE_LOOP_TIMING (1 << 0) + +/* vendor specific status register fields */ +#define F_XS_LANE_ALIGN_STAT (1 << 0) +#define F_PCS_BLK_LOCK (1 << 1) +#define F_PMD_SIGNAL_OK (1 << 2) +#define F_LINK_STAT (1 << 3) +#define F_ANEG_SPEED_1G (1 << 4) +#define F_ANEG_MASTER (1 << 5) + +#define S_ANEG_STAT 6 +#define M_ANEG_STAT 0x3 +#define G_ANEG_STAT(x) (((x) >> S_ANEG_STAT) & M_ANEG_STAT) + +enum { /* autonegotiation status */ + ANEG_IN_PROGR = 0, + ANEG_COMPLETE = 1, + ANEG_FAILED = 3 +}; + +/* + * Reset the PHY. May take up to 500ms to complete. + */ +static int tn1010_reset(struct cphy *phy, int wait) +{ + int err = t3_phy_reset(phy, MDIO_DEV_PMA_PMD, wait); + msleep(500); + return err; +} + +static int tn1010_power_down(struct cphy *phy, int enable) +{ + return t3_mdio_change_bits(phy, MDIO_DEV_PMA_PMD, MII_BMCR, + BMCR_PDOWN, enable ? 
BMCR_PDOWN : 0); +} + +static int tn1010_autoneg_enable(struct cphy *phy) +{ + int err; + + err = tn1010_power_down(phy, 0); + if (!err) + err = t3_mdio_change_bits(phy, MDIO_DEV_ANEG, MII_BMCR, 0, + BMCR_ANENABLE | BMCR_ANRESTART); + return err; +} + +static int tn1010_autoneg_restart(struct cphy *phy) +{ + int err; + + err = tn1010_power_down(phy, 0); + if (!err) + err = t3_mdio_change_bits(phy, MDIO_DEV_ANEG, MII_BMCR, 0, + BMCR_ANRESTART); + return err; +} + +static int tn1010_advertise(struct cphy *phy, unsigned int advert) +{ + int err, val; + + if (!(advert & ADVERTISED_1000baseT_Full)) + return -EINVAL; /* PHY can't disable 1000BASE-T */ + + val = ADVERTISE_CSMA | ADVERTISE_ENPAGE | ADVERTISE_NPAGE; + if (advert & ADVERTISED_Pause) + val |= ADVERTISE_PAUSE_CAP; + if (advert & ADVERTISED_Asym_Pause) + val |= ADVERTISE_PAUSE_ASYM; + err = mdio_write(phy, MDIO_DEV_ANEG, ANEG_ADVER, val); + if (err) + return err; + + val = (advert & ADVERTISED_10000baseT_Full) ? ADVERTISE_10000FULL : 0; + return mdio_write(phy, MDIO_DEV_ANEG, ANEG_10G_CTRL, val | + ADVERTISE_LOOP_TIMING); +} + +static int tn1010_get_link_status(struct cphy *phy, int *link_ok, + int *speed, int *duplex, int *fc) +{ + unsigned int status, lpa, adv; + int err, sp = -1, pause = 0; + + err = mdio_read(phy, MDIO_DEV_VEND1, TN1010_VEND1_STAT, &status); + if (err) + return err; + + if (link_ok) + *link_ok = (status & F_LINK_STAT) != 0; + + if (G_ANEG_STAT(status) == ANEG_COMPLETE) { + sp = (status & F_ANEG_SPEED_1G) ? SPEED_1000 : SPEED_10000; + + if (fc) { + err = mdio_read(phy, MDIO_DEV_ANEG, ANEG_LPA, &lpa); + if (!err) + err = mdio_read(phy, MDIO_DEV_ANEG, ANEG_ADVER, + &adv); + if (err) + return err; + + if (lpa & adv & ADVERTISE_PAUSE_CAP) + pause = PAUSE_RX | PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_CAP) && + (lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_ASYM)) + pause = PAUSE_TX; + else if ((lpa & ADVERTISE_PAUSE_ASYM) && + (adv & ADVERTISE_PAUSE_CAP)) + pause = PAUSE_RX; + } + } + if (speed) + *speed = sp; + if (duplex) + *duplex = DUPLEX_FULL; + if (fc) + *fc = pause; + return 0; +} + +static int tn1010_set_speed_duplex(struct cphy *phy, int speed, int duplex) +{ + return -EINVAL; /* require autoneg */ +} + +#ifdef C99_NOT_SUPPORTED +static struct cphy_ops tn1010_ops = { + tn1010_reset, + t3_phy_lasi_intr_enable, + t3_phy_lasi_intr_disable, + t3_phy_lasi_intr_clear, + t3_phy_lasi_intr_handler, + tn1010_autoneg_enable, + tn1010_autoneg_restart, + tn1010_advertise, + NULL, + tn1010_set_speed_duplex, + tn1010_get_link_status, + tn1010_power_down, +}; +#else +static struct cphy_ops tn1010_ops = { + .reset = tn1010_reset, + .intr_enable = t3_phy_lasi_intr_enable, + .intr_disable = t3_phy_lasi_intr_disable, + .intr_clear = t3_phy_lasi_intr_clear, + .intr_handler = t3_phy_lasi_intr_handler, + .autoneg_enable = tn1010_autoneg_enable, + .autoneg_restart = tn1010_autoneg_restart, + .advertise = tn1010_advertise, + .set_speed_duplex = tn1010_set_speed_duplex, + .get_link_status = tn1010_get_link_status, + .power_down = tn1010_power_down, +}; +#endif + +int t3_tn1010_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, + const struct mdio_ops *mdio_ops) +{ + cphy_init(phy, adapter, phy_addr, &tn1010_ops, mdio_ops, + SUPPORTED_1000baseT_Full | SUPPORTED_10000baseT_Full | + SUPPORTED_Autoneg | SUPPORTED_AUI | SUPPORTED_TP, + "1000/10GBASE-T"); + msleep(500); /* PHY needs up to 500ms to start responding to MDIO */ + return 0; +} diff --git a/sys/dev/cxgb/common/cxgb_vsc8211.c 
b/sys/dev/cxgb/common/cxgb_vsc8211.c index 61bdc9c7f5ed2..ad3c88e4c99d3 100644 --- a/sys/dev/cxgb/common/cxgb_vsc8211.c +++ b/sys/dev/cxgb/common/cxgb_vsc8211.c @@ -45,6 +45,7 @@ enum { VSC8211_EXT_CTRL = 23, VSC8211_INTR_ENABLE = 25, VSC8211_INTR_STATUS = 26, + VSC8211_LED_CTRL = 27, VSC8211_AUX_CTRL_STAT = 28, VSC8211_EXT_PAGE_AXS = 31, }; @@ -393,8 +394,10 @@ int t3_vsc8211_phy_prep(struct cphy *phy, adapter_t *adapter, int phy_addr, err = mdio_read(phy, 0, VSC8211_EXT_CTRL, &val); if (err) return err; - if (val & VSC_CTRL_MEDIA_MODE_HI) - return 0; /* copper interface, done */ + if (val & VSC_CTRL_MEDIA_MODE_HI) { + /* copper interface, just need to configure the LEDs */ + return mdio_write(phy, 0, VSC8211_LED_CTRL, 0x100); + } phy->caps = SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | SUPPORTED_MII | SUPPORTED_FIBRE | SUPPORTED_IRQ; diff --git a/sys/dev/cxgb/common/cxgb_xgmac.c b/sys/dev/cxgb/common/cxgb_xgmac.c index 745cc4b4dd5e2..51a02c25bcb69 100644 --- a/sys/dev/cxgb/common/cxgb_xgmac.c +++ b/sys/dev/cxgb/common/cxgb_xgmac.c @@ -44,7 +44,6 @@ __FBSDID("$FreeBSD$"); * # of exact address filters. The first one is used for the station address, * the rest are available for multicast addresses. */ -#define EXACT_ADDR_FILTERS 8 static inline int macidx(const struct cmac *mac) { @@ -159,16 +158,18 @@ int t3_mac_reset(struct cmac *mac) t3_write_reg(adap, A_XGM_TX_CTRL + oft, F_TXEN); t3_write_reg(adap, A_XGM_RX_CTRL + oft, F_RXEN); } + t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + oft, V_RXMAXFRAMERSIZE(M_RXMAXFRAMERSIZE), V_RXMAXFRAMERSIZE(MAX_FRAME_SIZE) | F_RXENFRAMER); + val = F_MAC_RESET_ | F_XGMAC_STOP_EN; - if (is_10G(adap) || mac->multiport) + if (!mac->multiport) + val |= F_XG2G_RESET_; + if (uses_xaui(adap)) val |= F_PCS_RESET_; - else if (uses_xaui(adap)) - val |= F_PCS_RESET_ | F_XG2G_RESET_; else - val |= F_RGMII_RESET_ | F_XG2G_RESET_; + val |= F_RGMII_RESET_; t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val); (void) t3_read_reg(adap, A_XGM_RESET_CTRL + oft); /* flush */ if ((val & F_PCS_RESET_) && adap->params.rev) { @@ -188,10 +189,10 @@ static int t3b2_mac_reset(struct cmac *mac) /* Stop egress traffic to xgm*/ - if (!macidx(mac)) - t3_set_reg_field(adap, A_MPS_CFG, F_PORT0ACTIVE, 0); + if (!macidx(mac)) + t3_set_reg_field(adap, A_MPS_CFG, F_PORT0ACTIVE, 0); else - t3_set_reg_field(adap, A_MPS_CFG, F_PORT1ACTIVE, 0); + t3_set_reg_field(adap, A_MPS_CFG, F_PORT1ACTIVE, 0); /* PCS in reset */ t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_); @@ -223,15 +224,15 @@ static int t3b2_mac_reset(struct cmac *mac) msleep(1); t3b_pcs_reset(mac); } - t3_write_reg(adap, A_XGM_RX_CFG + oft, + t3_write_reg(adap, A_XGM_RX_CFG + oft, F_DISPAUSEFRAMES | F_EN1536BFRAMES | F_RMFCS | F_ENJUMBO | F_ENHASHMCAST ); /*Resume egress traffic to xgm*/ - if (!macidx(mac)) - t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT0ACTIVE); + if (!macidx(mac)) + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT0ACTIVE); else - t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT1ACTIVE); + t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT1ACTIVE); return 0; } @@ -279,6 +280,9 @@ int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6]) * Specify the number of exact address filters that should be reserved for * unicast addresses. Caller should reload the unicast and multicast * addresses after calling this. + * + * Generally, this is 1 with the first one used for the station address, + * and the rest are available for multicast addresses. 
*/ int t3_mac_set_num_ucast(struct cmac *mac, unsigned char n) { @@ -385,7 +389,7 @@ static int rx_fifo_hwm(int mtu) * * Sets the MAC MTU and adjusts the FIFO PAUSE watermarks accordingly. */ -int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) +int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) { int hwm, lwm, divisor; int ipg; @@ -413,7 +417,7 @@ int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) reg = adap->params.rev == T3_REV_B2 ? A_XGM_RX_MAX_PKT_SIZE_ERR_CNT : A_XGM_RXFIFO_CFG; - + /* drain RX FIFO */ if (t3_wait_op_done(adap, reg + mac->offset, F_RXFIFO_EMPTY, 1, 20, 5)) { @@ -428,9 +432,8 @@ int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) enable_exact_filters(mac); } else t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, - V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), - V_RXMAXPKTSIZE(mtu)); - + V_RXMAXPKTSIZE(M_RXMAXPKTSIZE), + V_RXMAXPKTSIZE(mtu)); /* * Adjust the PAUSE frame watermarks. We always set the LWM, and the * HWM only if flow-control is enabled. @@ -462,10 +465,10 @@ int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu) */ if (adap->params.rev > 0) { divisor = (adap->params.rev == T3_REV_C) ? 64 : 8; - t3_write_reg(adap, A_XGM_PAUSE_TIMER + mac->offset, - (hwm - lwm) * 4 / divisor); + t3_write_reg(adap, A_XGM_PAUSE_TIMER + mac->offset, + (hwm - lwm) * 4 / divisor); } - t3_write_reg(adap, A_XGM_TX_PAUSE_QUANTA + mac->offset, + t3_write_reg(adap, A_XGM_TX_PAUSE_QUANTA + mac->offset, MAC_RXFIFO_SIZE * 4 * 8 / 512); return 0; } @@ -489,7 +492,7 @@ int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc) if (duplex >= 0 && duplex != DUPLEX_FULL) return -EINVAL; - if (mac->multiport) { + if (mac->multiport) { val = t3_read_reg(adap, A_XGM_RXFIFO_CFG + oft); val &= ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM); val |= V_RXFIFOPAUSEHWM(rx_fifo_hwm(t3_read_reg(adap, @@ -575,7 +578,7 @@ int t3_mac_enable(struct cmac *mac, int which) mac->txen = F_TXEN; mac->toggle_cnt = 0; } - if (which & MAC_DIRECTION_RX) + if (which & MAC_DIRECTION_RX) t3_write_reg(adap, A_XGM_RX_CTRL + oft, F_RXEN); return 0; } @@ -673,10 +676,10 @@ rxcheck: if (rx_mcnt != mac->rx_mcnt) { rx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap, A_XGM_RX_SPI4_SOP_EOP_CNT + - mac->offset))) + + mac->offset))) + (s->rx_fifo_ovfl - mac->rx_ocnt); mac->rx_ocnt = s->rx_fifo_ovfl; - } else + } else goto out; if (mac->rx_mcnt != s->rx_frames && rx_xcnt == 0 && mac->rx_xcnt == 0) { @@ -684,8 +687,8 @@ rxcheck: status = 2; goto out; } - -out: + +out: mac->tx_tcnt = tx_tcnt; mac->tx_xcnt = tx_xcnt; mac->tx_mcnt = s->tx_frames; diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h index f2b0531503910..39fe8eb91f58a 100644 --- a/sys/dev/cxgb/cxgb_adapter.h +++ b/sys/dev/cxgb/cxgb_adapter.h @@ -166,7 +166,7 @@ enum { TXQ_ETH = 0, * work request size in bytes */ #define WR_LEN (WR_FLITS * 8) -#define PIO_LEN (WR_LEN - sizeof(struct cpl_tx_pkt)) +#define PIO_LEN (WR_LEN - sizeof(struct cpl_tx_pkt_lso)) /* careful, the following are set on priv_flags and must not collide with diff --git a/sys/dev/cxgb/cxgb_config.h b/sys/dev/cxgb/cxgb_config.h index 723c23e7279f8..a5ee963b4734f 100644 --- a/sys/dev/cxgb/cxgb_config.h +++ b/sys/dev/cxgb/cxgb_config.h @@ -31,7 +31,6 @@ $FreeBSD$ ***************************************************************************/ #ifndef _CXGB_CONFIG_H_ #define _CXGB_CONFIG_H_ -#define DISABLE_MBUF_IOVEC #define RTALLOC2_DEFINED #define VM_FAULT_HOLD_DEFINED #ifndef CONFIG_DEFINED diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c index 
f6cfcdfbe46e6..4fb53b53efe70 100644 --- a/sys/dev/cxgb/cxgb_main.c +++ b/sys/dev/cxgb/cxgb_main.c @@ -9,7 +9,7 @@ modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Neither the name of the Chelsio Corporation nor the names of its + 2. Neither the name of the Chelsio Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. @@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$"); #include <net/if_dl.h> #include <net/if_media.h> #include <net/if_types.h> +#include <net/if_vlan_var.h> #include <netinet/in_systm.h> #include <netinet/in.h> @@ -724,10 +725,9 @@ cxgb_free(struct adapter *sc) } else printf("not offloading set\n"); #ifdef notyet - /* XXX need to handle unload in TOM */ if (sc->flags & CXGB_OFLD_INIT) cxgb_offload_deactivate(sc); -#endif +#endif free(sc->filters, M_DEVBUF); t3_sge_free(sc); @@ -979,7 +979,7 @@ cxgb_port_attach(device_t dev) * Only default to jumbo frames on 10GigE */ if (p->adapter->params.nports <= 2) - ifp->if_mtu = 9000; + ifp->if_mtu = ETHERMTU_JUMBO; if ((err = cxgb_makedev(p)) != 0) { printf("makedev failed %d\n", err); return (err); @@ -1255,13 +1255,23 @@ cxgb_link_start(struct port_info *p) struct ifnet *ifp; struct t3_rx_mode rm; struct cmac *mac = &p->mac; + int mtu, hwtagging; ifp = p->ifp; + bcopy(IF_LLADDR(ifp), p->hw_addr, ETHER_ADDR_LEN); + + mtu = ifp->if_mtu; + if (ifp->if_capenable & IFCAP_VLAN_MTU) + mtu += ETHER_VLAN_ENCAP_LEN; + + hwtagging = (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0; + t3_init_rx_mode(&rm, p); if (!mac->multiport) t3_mac_reset(mac); - t3_mac_set_mtu(mac, ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + t3_mac_set_mtu(mac, mtu); + t3_set_vlan_accel(p->adapter, 1 << p->tx_chan, hwtagging); t3_mac_set_address(mac, 0, p->hw_addr); t3_mac_set_rx_mode(mac, &rm); t3_link_start(&p->phy, mac, &p->link_config); @@ -1751,10 +1761,9 @@ offload_open(struct port_info *pi) adapter->params.rev == 0 ? 
adapter->port[0].ifp->if_mtu : 0xffff); init_smt(adapter); -#ifdef TOE_ENABLED /* Call back all registered clients */ cxgb_add_clients(tdev); -#endif + /* restore them in case the offload module has changed them */ if (err) { t3_tp_set_offload_mode(adapter, 0); @@ -1771,10 +1780,10 @@ offload_close(struct t3cdev *tdev) if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT)) return (0); -#ifdef TOE_ENABLED + /* Call back all registered clients */ cxgb_remove_clients(tdev); -#endif + tdev->lldev = NULL; cxgb_set_dummy_ops(tdev); t3_tp_set_offload_mode(adapter, 0); @@ -1904,7 +1913,7 @@ cxgb_set_mtu(struct port_info *p, int mtu) struct ifnet *ifp = p->ifp; int error = 0; - if ((mtu < ETHERMIN) || (mtu > ETHER_MAX_LEN_JUMBO)) + if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) error = EINVAL; else if (ifp->if_mtu != mtu) { PORT_LOCK(p); @@ -1924,7 +1933,7 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) struct port_info *p = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *)data; struct ifreq *ifr = (struct ifreq *)data; - int flags, error = 0; + int flags, error = 0, reinit = 0; uint32_t mask; /* @@ -1979,18 +1988,16 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) if (IFCAP_TXCSUM & ifp->if_capenable) { ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP - | CSUM_TSO); + | CSUM_IP | CSUM_TSO); } else { ifp->if_capenable |= IFCAP_TXCSUM; - ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP); - } - } else if (mask & IFCAP_RXCSUM) { - if (IFCAP_RXCSUM & ifp->if_capenable) { - ifp->if_capenable &= ~IFCAP_RXCSUM; - } else { - ifp->if_capenable |= IFCAP_RXCSUM; + ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP + | CSUM_IP); } } + if (mask & IFCAP_RXCSUM) { + ifp->if_capenable ^= IFCAP_RXCSUM; + } if (mask & IFCAP_TSO4) { if (IFCAP_TSO4 & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_TSO4; @@ -2005,7 +2012,26 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) error = EINVAL; } } + if (mask & IFCAP_VLAN_HWTAGGING) { + ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; + reinit = ifp->if_drv_flags & IFF_DRV_RUNNING; + } + if (mask & IFCAP_VLAN_MTU) { + ifp->if_capenable ^= IFCAP_VLAN_MTU; + reinit = ifp->if_drv_flags & IFF_DRV_RUNNING; + } + if (mask & IFCAP_VLAN_HWCSUM) { + ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; + } + if (reinit) { + cxgb_stop_locked(p); + cxgb_init_locked(p); + } PORT_UNLOCK(p); + +#ifdef VLAN_CAPABILITIES + VLAN_CAPABILITIES(ifp); +#endif break; default: error = ether_ioctl(ifp, command, data); @@ -2126,9 +2152,11 @@ check_t3b2_mac(struct adapter *adapter) p->mac.stats.num_toggled++; else if (status == 2) { struct cmac *mac = &p->mac; + int mtu = ifp->if_mtu; - t3_mac_set_mtu(mac, ifp->if_mtu + ETHER_HDR_LEN - + ETHER_VLAN_ENCAP_LEN); + if (ifp->if_capenable & IFCAP_VLAN_MTU) + mtu += ETHER_VLAN_ENCAP_LEN; + t3_mac_set_mtu(mac, mtu); t3_mac_set_address(mac, 0, p->hw_addr); cxgb_set_rxmode(p); t3_link_start(&p->phy, mac, &p->link_config); @@ -2434,7 +2462,7 @@ cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, if (t->intr_lat >= 0) { struct sge_qset *qs = &sc->sge.qs[t->qset_idx]; - q->coalesce_nsecs = t->intr_lat*1000; + q->coalesce_usecs = t->intr_lat; t3_update_qset_coalesce(qs, q); } break; @@ -2454,7 +2482,7 @@ cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, t->fl_size[0] = q->fl_size; t->fl_size[1] = q->jumbo_size; t->polling = q->polling; - t->intr_lat = q->coalesce_nsecs / 1000; + t->intr_lat = q->coalesce_usecs; t->cong_thres = q->cong_thres; 
break; } diff --git a/sys/dev/cxgb/cxgb_offload.c b/sys/dev/cxgb/cxgb_offload.c index 1eeafafa5b4f5..d865e7f7cbfb1 100644 --- a/sys/dev/cxgb/cxgb_offload.c +++ b/sys/dev/cxgb/cxgb_offload.c @@ -1,7 +1,6 @@ - /************************************************************************** -Copyright (c) 2007, Chelsio Inc. +Copyright (c) 2007-2008, Chelsio Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -104,7 +103,7 @@ unregister_tdev(struct t3cdev *tdev) mtx_unlock(&cxgb_db_lock); } -#ifdef TOE_ENABLED +#ifndef TCP_OFFLOAD_DISABLE /** * cxgb_register_client - register an offload client * @client: the client diff --git a/sys/dev/cxgb/cxgb_offload.h b/sys/dev/cxgb/cxgb_offload.h index dbe2bc50a4cd8..605dd0b0dc2a8 100644 --- a/sys/dev/cxgb/cxgb_offload.h +++ b/sys/dev/cxgb/cxgb_offload.h @@ -36,17 +36,13 @@ $FreeBSD$ #ifdef CONFIG_DEFINED #include <common/cxgb_version.h> #include <cxgb_config.h> -#ifdef TOE_ENABLED #include <ulp/tom/cxgb_l2t.h> -#endif #include <common/cxgb_tcb.h> #include <t3cdev.h> #else #include <dev/cxgb/common/cxgb_version.h> #include <dev/cxgb/cxgb_config.h> -#ifdef TOE_ENABLED #include <dev/cxgb/ulp/tom/cxgb_l2t.h> -#endif #include <dev/cxgb/common/cxgb_tcb.h> #include <dev/cxgb/t3cdev.h> #endif @@ -83,7 +79,6 @@ void cxgb_remove_clients(struct t3cdev *tdev); typedef int (*cxgb_cpl_handler_func)(struct t3cdev *dev, struct mbuf *m, void *ctx); -#ifdef TOE_ENABLED struct cxgb_client { char *name; void (*add) (struct t3cdev *); @@ -102,7 +97,6 @@ int cxgb_alloc_atid(struct t3cdev *dev, struct cxgb_client *client, void *ctx); int cxgb_alloc_stid(struct t3cdev *dev, struct cxgb_client *client, void *ctx); -#endif void *cxgb_free_atid(struct t3cdev *dev, int atid); void cxgb_free_stid(struct t3cdev *dev, int stid); void *cxgb_get_lctx(struct t3cdev *tdev, int stid); diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h index 7466d8a24be5e..73d7c77ae3cee 100644 --- a/sys/dev/cxgb/cxgb_osdep.h +++ b/sys/dev/cxgb/cxgb_osdep.h @@ -55,12 +55,25 @@ $FreeBSD$ typedef struct adapter adapter_t; struct sge_rspq; +enum { + TP_TMR_RES = 200, /* TP timer resolution in usec */ + MAX_NPORTS = 4, /* max # of ports */ + TP_SRAM_OFFSET = 4096, /* TP SRAM content offset in eeprom */ + TP_SRAM_LEN = 2112, /* TP SRAM content offset in eeprom */ +}; struct t3_mbuf_hdr { struct mbuf *mh_head; struct mbuf *mh_tail; }; +#ifndef PANIC_IF +#define PANIC_IF(exp) do { \ + if (exp) \ + panic("BUG: %s", #exp); \ +} while (0) +#endif + #define m_get_priority(m) ((uintptr_t)(m)->m_pkthdr.rcvif) #define m_set_priority(m, pri) ((m)->m_pkthdr.rcvif = (struct ifnet *)((uintptr_t)pri)) #define m_set_sgl(m, sgl) ((m)->m_pkthdr.header = (sgl)) @@ -127,9 +140,6 @@ void cxgb_log_tcb(struct adapter *sc, unsigned int tid); #define TX_START_MIN_DESC (TX_MAX_DESC << 2) - - - #define TX_START_MAX_DESC (TX_MAX_DESC << 3) /* maximum number of descriptors * call to start used per */ @@ -159,7 +169,7 @@ void prefetch(void *x) extern void kdb_backtrace(void); #define WARN_ON(condition) do { \ - if ((condition)!=0) { \ + if (__predict_false((condition)!=0)) { \ log(LOG_WARNING, "BUG: warning at %s:%d/%s()\n", __FILE__, __LINE__, __FUNCTION__); \ kdb_backtrace(); \ } \ @@ -384,6 +394,9 @@ static const int debug_flags = DBG_RX; #define ADVERTISE_1000XPSE_ASYM ANAR_X_PAUSE_ASYM #define ADVERTISE_1000XPAUSE ANAR_X_PAUSE_SYM +#define ADVERTISE_CSMA ANAR_CSMA +#define ADVERTISE_NPAGE ANAR_NP + /* Standard PCI Extended Capaibilities definitions */ #define PCI_CAP_ID_VPD 0x03 
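The cxgb_sge.c and cxgb_main.c hunks that follow change the interrupt-coalescing parameter from nanoseconds (coalesce_nsecs) to microseconds (coalesce_usecs). The holdoff timer programmed into the SGE response queue keeps the granularity implied by the old nsecs / 100 conversion, so t3_update_qset_coalesce() now computes coalesce_usecs * 10 with a minimum of 1. The sketch below restates that arithmetic; the helper name and the standalone main() are made up for illustration.

#include <stdio.h>

/*
 * 1 us == 10 ticks at the granularity implied by the old
 * coalesce_nsecs / 100 conversion; clamp to a minimum of 1, matching
 * max(p->coalesce_usecs * 10, 1U) in t3_update_qset_coalesce().
 * The function name is hypothetical.
 */
static unsigned int holdoff_tmr_from_usecs(unsigned int coalesce_usecs)
{
    unsigned int tmr = coalesce_usecs * 10;

    return (tmr > 0 ? tmr : 1);
}

int main(void)
{
    /* Defaults from t3_sge_prep() below: 50 us (>2 ports), 5 us otherwise. */
    printf("50 us -> %u ticks\n", holdoff_tmr_from_usecs(50));
    printf(" 5 us -> %u ticks\n", holdoff_tmr_from_usecs(5));
    printf(" 0 us -> %u tick\n",  holdoff_tmr_from_usecs(0));
    return (0);
}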
diff --git a/sys/dev/cxgb/cxgb_sge.c b/sys/dev/cxgb/cxgb_sge.c index 7f9c933854d05..50335aa17bb0f 100644 --- a/sys/dev/cxgb/cxgb_sge.c +++ b/sys/dev/cxgb/cxgb_sge.c @@ -394,12 +394,12 @@ t3_sge_prep(adapter_t *adap, struct sge_params *p) struct qset_params *q = p->qset + i; if (adap->params.nports > 2) { - q->coalesce_nsecs = 50000; + q->coalesce_usecs = 50; } else { #ifdef INVARIANTS - q->coalesce_nsecs = 10000; + q->coalesce_usecs = 10; #else - q->coalesce_nsecs = 5000; + q->coalesce_usecs = 5; #endif } q->polling = adap->params.rev > 0; @@ -490,7 +490,7 @@ void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) { - qs->rspq.holdoff_tmr = max(p->coalesce_nsecs/100, 1U); + qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U); qs->rspq.polling = 0 /* p->polling */; } @@ -1314,6 +1314,10 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) cntrl = V_TXPKT_INTF(pi->txpkt_intf); GET_VTAG_MI(cntrl, batchmi); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); + if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) + cntrl |= F_TXPKT_IPCSUM_DIS; + if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)))) + cntrl |= F_TXPKT_L4CSUM_DIS; cbe->cntrl = htonl(cntrl); cbe->len = htonl(batchmi->mi_len | 0x80000000); cbe->addr = htobe64(segs[i].ds_addr); @@ -1343,7 +1347,7 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) tmpmi = mv->mv_vec; txd->flit[2] = 0; - GET_VTAG_MI(cntrl, mi); + GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); hdr->cntrl = htonl(cntrl); mlen = m0->m_pkthdr.len; @@ -1356,7 +1360,10 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) if (__predict_false(undersized)) { pkthdr = tmp; - dump_mi(mi); + if (mi) + dump_mi(mi); + printf("mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x", + m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags); panic("discontig packet - fixxorz"); } else pkthdr = m0->m_data; @@ -1376,12 +1383,39 @@ t3_encap(struct sge_qset *qs, struct mbuf **m, int count) V_LSO_IPHDR_WORDS(ip->ip_hl) | V_LSO_TCPHDR_WORDS(tcp->th_off); hdr->lso_info = htonl(tso_info); + + if (__predict_false(mlen <= PIO_LEN)) { + /* pkt not undersized but fits in PIO_LEN + * Indicates a TSO bug at the higher levels. 
+ */ + DPRINTF("**5592 Fix** mbuf=%p,len=%d,tso_segsz=%d,csum_flags=%#x,flags=%#x", + m0, mlen, m0->m_pkthdr.tso_segsz, m0->m_pkthdr.csum_flags, m0->m_flags); + txq_prod(txq, 1, &txqs); + m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[3]); + m_freem(m0); + m0 = NULL; + flits = (mlen + 7) / 8 + 3; + hdr->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | + V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | + F_WR_SOP | F_WR_EOP | txqs.compl); + wmb(); + hdr->wr.wr_lo = htonl(V_WR_LEN(flits) | + V_WR_GEN(txqs.gen) | V_WR_TID(txq->token)); + + wr_gen2(txd, txqs.gen); + check_ring_tx_db(sc, txq); + return (0); + } flits = 3; } else { struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd; GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); + if (__predict_false(!(m0->m_pkthdr.csum_flags & CSUM_IP))) + cntrl |= F_TXPKT_IPCSUM_DIS; + if (__predict_false(!(m0->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP)))) + cntrl |= F_TXPKT_L4CSUM_DIS; cpl->cntrl = htonl(cntrl); mlen = m0->m_pkthdr.len; cpl->len = htonl(mlen | 0x80000000); @@ -3223,11 +3257,11 @@ t3_lro_enable(SYSCTL_HANDLER_ARGS) } static int -t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS) +t3_set_coalesce_usecs(SYSCTL_HANDLER_ARGS) { adapter_t *sc = arg1; struct qset_params *qsp = &sc->params.sge.qset[0]; - int coalesce_nsecs; + int coalesce_usecs; struct sge_qset *qs; int i, j, err, nqsets = 0; struct mtx *lock; @@ -3235,25 +3269,25 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS) if ((sc->flags & FULL_INIT_DONE) == 0) return (ENXIO); - coalesce_nsecs = qsp->coalesce_nsecs; - err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req); + coalesce_usecs = qsp->coalesce_usecs; + err = sysctl_handle_int(oidp, &coalesce_usecs, arg2, req); if (err != 0) { return (err); } - if (coalesce_nsecs == qsp->coalesce_nsecs) + if (coalesce_usecs == qsp->coalesce_usecs) return (0); for (i = 0; i < sc->params.nports; i++) for (j = 0; j < sc->port[i].nqsets; j++) nqsets++; - coalesce_nsecs = max(100, coalesce_nsecs); + coalesce_usecs = max(1, coalesce_usecs); for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[i]; qsp = &sc->params.sge.qset[i]; - qsp->coalesce_nsecs = coalesce_nsecs; + qsp->coalesce_usecs = coalesce_usecs; lock = (sc->flags & USING_MSIX) ? &qs->rspq.lock : &sc->sge.qs[0].rspq.lock; @@ -3356,8 +3390,8 @@ t3_add_configured_sysctls(adapter_t *sc) SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal", CTLTYPE_INT|CTLFLAG_RW, sc, - 0, t3_set_coalesce_nsecs, - "I", "interrupt coalescing timer (ns)"); + 0, t3_set_coalesce_usecs, + "I", "interrupt coalescing timer (us)"); for (i = 0; i < sc->params.nports; i++) { struct port_info *pi = &sc->port[i]; diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c new file mode 100644 index 0000000000000..b198904533465 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c @@ -0,0 +1,294 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/eventhandler.h> + +#include <net/if.h> +#include <net/if_var.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#endif + +/* + * XXX :-/ + * + */ + +#define idr_init(x) + +cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; + +static void open_rnic_dev(struct t3cdev *); +static void close_rnic_dev(struct t3cdev *); + +static TAILQ_HEAD( ,iwch_dev) dev_list; +static struct mtx dev_mutex; +static eventhandler_tag event_tag; + +static void +rnic_init(struct iwch_dev *rnicp) +{ + CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, rnicp); + idr_init(&rnicp->cqidr); + idr_init(&rnicp->qpidr); + idr_init(&rnicp->mmidr); + mtx_init(&rnicp->lock, "iwch rnic lock", NULL, MTX_DEF|MTX_DUPOK); + + rnicp->attr.vendor_id = 0x168; + rnicp->attr.vendor_part_id = 7; + rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; + rnicp->attr.max_wrs = (1UL << 24) - 1; + rnicp->attr.max_sge_per_wr = T3_MAX_SGE; + rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; + rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; + rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1; + rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); + rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; + rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; + rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */ + rnicp->attr.can_resize_wq = 0; + rnicp->attr.max_rdma_reads_per_qp = 8; + rnicp->attr.max_rdma_read_resources = + rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; + rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ + rnicp->attr.max_rdma_read_depth = + 
rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; + rnicp->attr.rq_overflow_handled = 0; + rnicp->attr.can_modify_ird = 0; + rnicp->attr.can_modify_ord = 0; + rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; + rnicp->attr.stag0_value = 1; + rnicp->attr.zbva_support = 1; + rnicp->attr.local_invalidate_fence = 1; + rnicp->attr.cq_overflow_detection = 1; + return; +} + +static void +open_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *rnicp; + static int vers_printed; + + CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev); + if (!vers_printed++) + printf("Chelsio T3 RDMA Driver - version %s\n", + DRV_VERSION); + rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + if (!rnicp) { + printf("Cannot allocate ib device\n"); + return; + } + rnicp->rdev.ulp = rnicp; + rnicp->rdev.t3cdev_p = tdev; + + mtx_lock(&dev_mutex); + + if (cxio_rdev_open(&rnicp->rdev)) { + mtx_unlock(&dev_mutex); + printf("Unable to open CXIO rdev\n"); + ib_dealloc_device(&rnicp->ibdev); + return; + } + + rnic_init(rnicp); + + TAILQ_INSERT_TAIL(&dev_list, rnicp, entry); + mtx_unlock(&dev_mutex); + + if (iwch_register_device(rnicp)) { + printf("Unable to register device\n"); + close_rnic_dev(tdev); + } +#ifdef notyet + printf("Initialized device %s\n", + pci_name(rnicp->rdev.rnic_info.pdev)); +#endif + return; +} + +static void +close_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *dev, *tmp; + CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev); + mtx_lock(&dev_mutex); + + TAILQ_FOREACH_SAFE(dev, &dev_list, entry, tmp) { + if (dev->rdev.t3cdev_p == tdev) { +#ifdef notyet + list_del(&dev->entry); + iwch_unregister_device(dev); + cxio_rdev_close(&dev->rdev); + idr_destroy(&dev->cqidr); + idr_destroy(&dev->qpidr); + idr_destroy(&dev->mmidr); + ib_dealloc_device(&dev->ibdev); +#endif + break; + } + } + mtx_unlock(&dev_mutex); +} + +static ifaddr_event_handler_t +ifaddr_event_handler(void *arg, struct ifnet *ifp) +{ + printf("%s if name %s \n", __FUNCTION__, ifp->if_xname); + if (ifp->if_capabilities & IFCAP_TOE4) { + KASSERT(T3CDEV(ifp) != NULL, ("null t3cdev ptr!")); + if (cxio_hal_find_rdev_by_t3cdev(T3CDEV(ifp)) == NULL) + open_rnic_dev(T3CDEV(ifp)); + } + return 0; +} + + +static int +iwch_init_module(void) +{ + int err; + struct ifnet *ifp; + + printf("%s enter\n", __FUNCTION__); + TAILQ_INIT(&dev_list); + mtx_init(&dev_mutex, "iwch dev_list lock", NULL, MTX_DEF); + + err = cxio_hal_init(); + if (err) + return err; + err = iwch_cm_init(); + if (err) + return err; + cxio_register_ev_cb(iwch_ev_dispatch); + + /* Register for ifaddr events to dynamically add TOE devs */ + event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_event_handler, + NULL, EVENTHANDLER_PRI_ANY); + + /* Register existing TOE interfaces by walking the ifnet chain */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &ifnet, if_link) { + (void)ifaddr_event_handler(NULL, ifp); + } + IFNET_RUNLOCK(); + return 0; +} + +static void +iwch_exit_module(void) +{ + EVENTHANDLER_DEREGISTER(ifaddr_event, event_tag); + cxio_unregister_ev_cb(iwch_ev_dispatch); + iwch_cm_term(); + cxio_hal_exit(); +} + +static int +iwch_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + printf("Loading iw_cxgb.\n"); + + iwch_init_module(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("Unloading iw_cxgb.\n"); + iwch_exit_module(); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data = { + "iw_cxgb", + iwch_load, + 
0 +}; + +MODULE_VERSION(iw_cxgb, 1); +DECLARE_MODULE(iw_cxgb, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); +MODULE_DEPEND(iw_cxgb, rdma_core, 1, 1, 1); +MODULE_DEPEND(iw_cxgb, if_cxgb, 1, 1, 1); +MODULE_DEPEND(iw_cxgb, t3_tom, 1, 1, 1); + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h new file mode 100644 index 0000000000000..f4b28566ebf5b --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.h @@ -0,0 +1,168 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ + +#ifndef __IWCH_H__ +#define __IWCH_H__ + +struct iwch_pd; +struct iwch_cq; +struct iwch_qp; +struct iwch_mr; + + +struct iwch_rnic_attributes { + u32 vendor_id; + u32 vendor_part_id; + u32 max_qps; + u32 max_wrs; /* Max for any SQ/RQ */ + u32 max_sge_per_wr; + u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ + u32 max_cqs; + u32 max_cqes_per_cq; + u32 max_mem_regs; + u32 max_phys_buf_entries; /* for phys buf list */ + u32 max_pds; + + /* + * The memory page sizes supported by this RNIC. + * Bit position i in bitmap indicates page of + * size (4k)^i. Phys block list mode unsupported. + */ + u32 mem_pgsizes_bitmask; + u8 can_resize_wq; + + /* + * The maximum number of RDMA Reads that can be outstanding + * per QP with this RNIC as the target. + */ + u32 max_rdma_reads_per_qp; + + /* + * The maximum number of resources used for RDMA Reads + * by this RNIC with this RNIC as the target. + */ + u32 max_rdma_read_resources; + + /* + * The max depth per QP for initiation of RDMA Read + * by this RNIC. 
+ */ + u32 max_rdma_read_qp_depth; + + /* + * The maximum depth for initiation of RDMA Read + * operations by this RNIC on all QPs + */ + u32 max_rdma_read_depth; + u8 rq_overflow_handled; + u32 can_modify_ird; + u32 can_modify_ord; + u32 max_mem_windows; + u32 stag0_value; + u8 zbva_support; + u8 local_invalidate_fence; + u32 cq_overflow_detection; +}; + +struct iwch_dev { + struct ib_device ibdev; + struct cxio_rdev rdev; + u32 device_cap_flags; + struct iwch_rnic_attributes attr; + struct kvl cqidr; + struct kvl qpidr; + struct kvl mmidr; + struct mtx lock; + TAILQ_ENTRY(iwch_dev) entry; +}; + +#ifndef container_of +#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) +#endif + +static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct iwch_dev, ibdev); +} + +static inline int t3b_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3B; +} + +static inline int t3a_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3A; +} + +static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) +{ + return kvl_lookup(&rhp->cqidr, cqid); +} + +static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) +{ + return kvl_lookup(&rhp->qpidr, qpid); +} + +static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) +{ + return kvl_lookup(&rhp->mmidr, mmid); +} + +static inline int insert_handle(struct iwch_dev *rhp, struct kvl *kvlp, + void *handle, u32 id) +{ + int ret; + u32 newid; + + do { + mtx_lock(&rhp->lock); + ret = kvl_alloc_above(kvlp, handle, id, &newid); + WARN_ON(ret != 0); + WARN_ON(!ret && newid != id); + mtx_unlock(&rhp->lock); + } while (ret == -EAGAIN); + + return ret; +} + +static inline void remove_handle(struct iwch_dev *rhp, struct kvl *kvlp, u32 id) +{ + mtx_lock(&rhp->lock); + kvl_delete(kvlp, id); + mtx_unlock(&rhp->lock); +} + +extern struct cxgb_client t3c_client; +extern cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; +extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m); +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c new file mode 100644 index 0000000000000..cec461147ccc7 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.c @@ -0,0 +1,1779 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/uio.h> + +#include <net/route.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp.h> +#include <netinet/tcpip.h> + +#include <contrib/rdma/ib_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/tom/cxgb_tom.h> +#include <ulp/tom/cxgb_t3_ddp.h> +#include <ulp/tom/cxgb_defs.h> +#include <ulp/tom/cxgb_toepcb.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#endif + +#ifdef KTR +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; +#endif + +SYSCTL_NODE(_hw, OID_AUTO, cxgb, CTLFLAG_RD, 0, "iw_cxgb driver parameters"); + +static int ep_timeout_secs = 10; +TUNABLE_INT("hw.iw_cxgb.ep_timeout_secs", &ep_timeout_secs); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, ep_timeout_secs, CTLFLAG_RDTUN, &ep_timeout_secs, 0, + "CM Endpoint operation timeout in seconds (default=10)"); + +static int mpa_rev = 1; +TUNABLE_INT("hw.iw_cxgb.mpa_rev", &mpa_rev); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, mpa_rev, CTLFLAG_RDTUN, &mpa_rev, 0, + "MPA Revision, 0 supports amso1100, 1 is spec compliant. 
(default=1)"); + +static int markers_enabled = 0; +TUNABLE_INT("hw.iw_cxgb.markers_enabled", &markers_enabled); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, markers_enabled, CTLFLAG_RDTUN, &markers_enabled, 0, + "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +TUNABLE_INT("hw.iw_cxgb.crc_enabled", &crc_enabled); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, crc_enabled, CTLFLAG_RDTUN, &crc_enabled, 0, + "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +TUNABLE_INT("hw.iw_cxgb.rcv_win", &rcv_win); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, rcv_win, CTLFLAG_RDTUN, &rcv_win, 0, + "TCP receive window in bytes (default=256KB)"); + +static int snd_win = 32 * 1024; +TUNABLE_INT("hw.iw_cxgb.snd_win", &snd_win); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, snd_win, CTLFLAG_RDTUN, &snd_win, 0, + "TCP send window in bytes (default=32KB)"); + +static unsigned int nocong = 0; +TUNABLE_INT("hw.iw_cxgb.nocong", &nocong); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, nocong, CTLFLAG_RDTUN, &nocong, 0, + "Turn off congestion control (default=0)"); + +static unsigned int cong_flavor = 1; +TUNABLE_INT("hw.iw_cxgb.cong_flavor", &cong_flavor); +SYSCTL_UINT(_hw_cxgb, OID_AUTO, cong_flavor, CTLFLAG_RDTUN, &cong_flavor, 0, + "TCP Congestion control flavor (default=1)"); + +static void ep_timeout(void *arg); +static void connect_reply_upcall(struct iwch_ep *ep, int status); +static void iwch_so_upcall(struct socket *so, void *arg, int waitflag); + +/* + * Cruft to offload socket upcalls onto thread. + */ +static struct mtx req_lock; +static TAILQ_HEAD(iwch_ep_list, iwch_ep_common) req_list; +static struct task iw_cxgb_task; +static struct taskqueue *iw_cxgb_taskq; +static void process_req(void *ctx, int pending); + +static void +start_ep_timer(struct iwch_ep *ep) +{ + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + if (callout_pending(&ep->timer)) { + CTR2(KTR_IW_CXGB, "%s stopped / restarted timer ep %p", __FUNCTION__, ep); + callout_deactivate(&ep->timer); + callout_drain(&ep->timer); + } else { + /* + * XXX this looks racy + */ + get_ep(&ep->com); + callout_init(&ep->timer, TRUE); + } + callout_reset(&ep->timer, ep_timeout_secs * hz, ep_timeout, ep); +} + +static void +stop_ep_timer(struct iwch_ep *ep) +{ + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + callout_drain(&ep->timer); + put_ep(&ep->com); +} + +static int set_tcpinfo(struct iwch_ep *ep) +{ + struct tcp_info ti; + struct sockopt sopt; + int err; + + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = IPPROTO_TCP; + sopt.sopt_name = TCP_INFO; + sopt.sopt_val = (caddr_t)&ti; + sopt.sopt_valsize = sizeof ti; + sopt.sopt_td = NULL; + + err = sogetopt(ep->com.so, &sopt); + if (err) { + printf("%s can't get tcpinfo\n", __FUNCTION__); + return -err; + } + if (!(ti.tcpi_options & TCPI_OPT_TOE)) { + printf("%s connection NOT OFFLOADED!\n", __FUNCTION__); + return -EINVAL; + } + + ep->snd_seq = ti.tcpi_snd_nxt; + ep->rcv_seq = ti.tcpi_rcv_nxt; + ep->emss = ti.__tcpi_snd_mss - sizeof(struct tcpiphdr); + ep->hwtid = TOEPCB(ep->com.so)->tp_tid; /* XXX */ + if (ti.tcpi_options & TCPI_OPT_TIMESTAMPS) + ep->emss -= 12; + if (ep->emss < 128) + ep->emss = 128; + return 0; +} + +static enum iwch_ep_state +state_read(struct iwch_ep_common *epc) +{ + enum iwch_ep_state state; + + mtx_lock(&epc->lock); + state = epc->state; + mtx_unlock(&epc->lock); + return state; +} + +static void +__state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + epc->state = new; +} + +static void +state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + + mtx_lock(&epc->lock); 
+ CTR3(KTR_IW_CXGB, "%s - %s -> %s", __FUNCTION__, states[epc->state], states[new]); + __state_set(epc, new); + mtx_unlock(&epc->lock); + return; +} + +static void * +alloc_ep(int size, int flags) +{ + struct iwch_ep_common *epc; + + epc = malloc(size, M_DEVBUF, flags); + if (epc) { + memset(epc, 0, size); + refcount_init(&epc->refcount, 1); + mtx_init(&epc->lock, "iwch_epc lock", NULL, MTX_DEF|MTX_DUPOK); + cv_init(&epc->waitq, "iwch_epc cv"); + } + CTR2(KTR_IW_CXGB, "%s alloc ep %p", __FUNCTION__, epc); + return epc; +} + +void __free_ep(struct iwch_ep_common *epc) +{ + CTR3(KTR_IW_CXGB, "%s ep %p state %s", __FUNCTION__, epc, states[state_read(epc)]); + KASSERT(!epc->so, ("%s warning ep->so %p \n", __FUNCTION__, epc->so)); + KASSERT(!epc->entry.tqe_prev, ("%s epc %p still on req list!\n", __FUNCTION__, epc)); + free(epc, M_DEVBUF); +} + +int +iwch_quiesce_tid(struct iwch_ep *ep) +{ +#ifdef notyet + struct cpl_set_tcb_field *req; + struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT); + + if (m == NULL) + return (-ENOMEM); + req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); + + m_set_priority(m, CPL_PRIORITY_DATA); + cxgb_ofld_send(ep->com.tdev, m); +#endif + return 0; +} + +int +iwch_resume_tid(struct iwch_ep *ep) +{ +#ifdef notyet + struct cpl_set_tcb_field *req; + struct mbuf *m = get_mbuf(NULL, sizeof(*req), M_NOWAIT); + + if (m == NULL) + return (-ENOMEM); + req = (struct cpl_set_tcb_field *) mbuf_put(m, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = 0; + + m_set_priority(m, CPL_PRIORITY_DATA); + cxgb_ofld_send(ep->com.tdev, m); +#endif + return 0; +} + +static struct rtentry * +find_route(__be32 local_ip, __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct route iproute; + struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst; + + bzero(&iproute, sizeof iproute); + dst->sin_family = AF_INET; + dst->sin_len = sizeof *dst; + dst->sin_addr.s_addr = peer_ip; + + rtalloc(&iproute); + return iproute.ro_rt; +} + +static void +close_socket(struct iwch_ep_common *epc) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); + SOCK_LOCK(epc->so); + epc->so->so_upcall = NULL; + epc->so->so_upcallarg = NULL; + epc->so->so_rcv.sb_flags &= ~SB_UPCALL; + SOCK_UNLOCK(epc->so); + soshutdown(epc->so, SHUT_WR|SHUT_RD); + epc->so = NULL; +} + +static void +shutdown_socket(struct iwch_ep_common *epc) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, epc, epc->so, states[epc->state]); + soshutdown(epc->so, SHUT_WR); +} + +static void +abort_socket(struct iwch_ep *ep) +{ + struct sockopt sopt; + int err; + struct linger l; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + l.l_onoff = 1; + l.l_linger = 0; + + /* linger_time of 0 forces RST to be sent */ + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = SOL_SOCKET; + 
sopt.sopt_name = SO_LINGER; + sopt.sopt_val = (caddr_t)&l; + sopt.sopt_valsize = sizeof l; + sopt.sopt_td = NULL; + err = sosetopt(ep->com.so, &sopt); + if (err) + printf("%s can't set linger to 0, no RST! err %d\n", __FUNCTION__, err); +} + +static void +send_mpa_req(struct iwch_ep *ep) +{ + int mpalen; + struct mpa_message *mpa; + struct mbuf *m; + int err; + + CTR3(KTR_IW_CXGB, "%s ep %p pd_len %d", __FUNCTION__, ep, ep->plen); + + mpalen = sizeof(*mpa) + ep->plen; + m = m_gethdr(mpalen, M_NOWAIT); + if (m == NULL) { + connect_reply_upcall(ep, -ENOMEM); + return; + } + mpa = mtod(m, struct mpa_message *); + m->m_len = mpalen; + m->m_pkthdr.len = mpalen; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev; + if (ep->plen) + memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + + err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); + if (err) { + m_freem(m); + connect_reply_upcall(ep, -ENOMEM); + return; + } + + start_ep_timer(ep); + state_set(&ep->com, MPA_REQ_SENT); + return; +} + +static int +send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct mpa_message *mpa; + struct mbuf *m; + int err; + + CTR3(KTR_IW_CXGB, "%s ep %p plen %d", __FUNCTION__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + m = m_gethdr(mpalen, M_NOWAIT); + if (m == NULL) { + printf("%s - cannot alloc mbuf!\n", __FUNCTION__); + return (-ENOMEM); + } + mpa = mtod(m, struct mpa_message *); + m->m_len = mpalen; + m->m_pkthdr.len = mpalen; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + err = sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, ep->com.thread); + PANIC_IF(err); + return 0; +} + +static int +send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct mpa_message *mpa; + struct mbuf *m; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p plen %d", __FUNCTION__, ep, ep->com.so, plen); + + mpalen = sizeof(*mpa) + plen; + + m = m_gethdr(mpalen, M_NOWAIT); + if (m == NULL) { + printf("%s - cannot alloc mbuf!\n", __FUNCTION__); + return (-ENOMEM); + } + mpa = mtod(m, struct mpa_message *); + m->m_len = mpalen; + m->m_pkthdr.len = mpalen; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? 
MPA_MARKERS : 0); + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + state_set(&ep->com, MPA_REP_SENT); + return sosend(ep->com.so, NULL, NULL, m, NULL, MSG_DONTWAIT, + ep->com.thread); +} + +static void +close_complete_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "close complete delivered ep %p cm_id %p tid %d", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void +abort_connection(struct iwch_ep *ep) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + state_set(&ep->com, ABORTING); + abort_socket(ep); + close_socket(&ep->com); + close_complete_upcall(ep); + state_set(&ep->com, DEAD); + put_ep(&ep->com); +} + +static void +peer_close_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "peer close delivered ep %p cm_id %p tid %d", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void +peer_abort_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = ECONNRESET; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "abort delivered ep %p cm_id %p tid %d", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void +connect_reply_upcall(struct iwch_ep *ep, int status) +{ + struct iw_cm_event event; + + CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + + if ((status == 0) || (status == ECONNREFUSED)) { + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + if (ep->com.cm_id) { + CTR4(KTR_IW_CXGB, "%s ep %p tid %d status %d", __FUNCTION__, ep, + ep->hwtid, status); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void +connect_request_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + event.provider_data = ep; + event.so = ep->com.so; + if (state_read(&ep->parent_ep->com) != DEAD) + 
ep->parent_ep->com.cm_id->event_handler( + ep->parent_ep->com.cm_id, + &event); + put_ep(&ep->parent_ep->com); + ep->parent_ep = NULL; +} + +static void +established_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + if (ep->com.cm_id) { + CTR3(KTR_IW_CXGB, "%s ep %p tid %d", __FUNCTION__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void +process_mpa_reply(struct iwch_ep *ep) +{ + struct mpa_message *mpa; + u16 plen; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + int err; + struct mbuf *top, *m; + int flags = MSG_DONTWAIT; + struct uio uio; + int len; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_SENT) + return; + + uio.uio_resid = len = 1000000; + uio.uio_td = ep->com.thread; + err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); + if (err) { + if (err == EWOULDBLOCK) { + start_ep_timer(ep); + return; + } + err = -err; + goto err; + } + + if (ep->com.so->so_rcv.sb_mb) { + printf("%s data after soreceive called! so %p sb_mb %p top %p\n", + __FUNCTION__, ep->com.so, ep->com.so->so_rcv.sb_mb, top); + } + + m = top; + do { + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { + err = (-EINVAL); + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); + ep->mpa_pkt_len += m->m_len; + if (!m->m_next) + m = m->m_nextpkt; + else + m = m->m_next; + } while (m); + + m_freem(top); + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + mpa = (struct mpa_message *)ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision != mpa_rev) { + CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); + err = EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); + err = EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); + err = EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + CTR2(KTR_IW_CXGB, "%s pkt too big %d", __FUNCTION__, ep->mpa_pkt_len); + err = EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + if (mpa->flags & MPA_REJECT) { + err = ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + CTR1(KTR_IW_CXGB, "%s mpa rpl looks good!", __FUNCTION__); + state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 
1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + if (set_tcpinfo(ep)) { + printf("%s set_tcpinfo error\n", __FUNCTION__); + goto err; + } + CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (!err) + goto out; +err: + abort_connection(ep); +out: + connect_reply_upcall(ep, err); + return; +} + +static void +process_mpa_request(struct iwch_ep *ep) +{ + struct mpa_message *mpa; + u16 plen; + int flags = MSG_DONTWAIT; + struct mbuf *top, *m; + int err; + struct uio uio; + int len; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_WAIT) + return; + + uio.uio_resid = len = 1000000; + uio.uio_td = ep->com.thread; + err = soreceive(ep->com.so, NULL, &uio, &top, NULL, &flags); + if (err) { + if (err == EWOULDBLOCK) { + start_ep_timer(ep); + return; + } + err = -err; + goto err; + } + + m = top; + do { + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + m->m_len > sizeof(ep->mpa_pkt)) { + CTR2(KTR_IW_CXGB, "%s mpa message too big %d", __FUNCTION__, + ep->mpa_pkt_len + m->m_len); + goto err; + } + + + /* + * Copy the new data into our accumulation buffer. + */ + m_copydata(m, 0, m->m_len, &(ep->mpa_pkt[ep->mpa_pkt_len])); + ep->mpa_pkt_len += m->m_len; + + if (!m->m_next) + m = m->m_nextpkt; + else + m = m->m_next; + } while (m); + + m_freem(top); + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) { + start_ep_timer(ep); + CTR2(KTR_IW_CXGB, "%s not enough header %d...waiting...", __FUNCTION__, + ep->mpa_pkt_len); + return; + } + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision != mpa_rev) { + CTR2(KTR_IW_CXGB, "%s bad mpa rev %d", __FUNCTION__, mpa->revision); + goto err; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + CTR2(KTR_IW_CXGB, "%s bad mpa key |%16s|", __FUNCTION__, mpa->key); + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + CTR2(KTR_IW_CXGB, "%s plen too big %d", __FUNCTION__, plen); + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + CTR2(KTR_IW_CXGB, "%s more data after private data %d", __FUNCTION__, + ep->mpa_pkt_len); + goto err; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. 
+ */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) { + start_ep_timer(ep); + CTR2(KTR_IW_CXGB, "%s more mpa msg to come %d", __FUNCTION__, + ep->mpa_pkt_len); + return; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + if (set_tcpinfo(ep)) { + printf("%s set_tcpinfo error\n", __FUNCTION__); + goto err; + } + CTR5(KTR_IW_CXGB, "%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + connect_request_upcall(ep); + return; +err: + abort_connection(ep); + return; +} + +static void +process_peer_close(struct iwch_ep *ep) +{ + struct iwch_qp_attributes attrs; + int disconnect = 1; + int release = 0; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + mtx_lock(&ep->com.lock); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. + */ + __state_set(&ep->com, CLOSING); + get_ep(&ep->com); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + break; + case FPDU_MODE: + start_ep_timer(ep); + __state_set(&ep->com, CLOSING); + attrs.next_state = IWCH_QP_STATE_CLOSING; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + peer_close_upcall(ep); + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_socket(&ep->com); + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + PANIC_IF(1); + } + mtx_unlock(&ep->com.lock); + if (disconnect) + iwch_ep_disconnect(ep, 0, M_NOWAIT); + if (release) + put_ep(&ep->com); + return; +} + +static void +process_conn_error(struct iwch_ep *ep) +{ + struct iwch_qp_attributes attrs; + int ret; + int state; + + state = state_read(&ep->com); + CTR5(KTR_IW_CXGB, "%s ep %p so %p so->so_error %u state %s", __FUNCTION__, ep, ep->com.so, ep->com.so->so_error, states[ep->com.state]); + switch (state) { + case MPA_REQ_WAIT: + stop_ep_timer(ep); + break; + case MPA_REQ_SENT: + stop_ep_timer(ep); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REP_SENT: + ep->com.rpl_err = ECONNRESET; + CTR1(KTR_IW_CXGB, "waking up ep %p", ep); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. 
+ */ + get_ep(&ep->com); + break; + case MORIBUND: + case CLOSING: + stop_ep_timer(ep); + /*FALLTHROUGH*/ + case FPDU_MODE: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + ret = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + log(LOG_ERR, + "%s - qp <- error failed!\n", + __FUNCTION__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + CTR2(KTR_IW_CXGB, "%s so_error %d IN DEAD STATE!!!!", __FUNCTION__, + ep->com.so->so_error); + return; + default: + PANIC_IF(1); + break; + } + + if (state != ABORTING) { + close_socket(&ep->com); + state_set(&ep->com, DEAD); + put_ep(&ep->com); + } + return; +} + +static void +process_close_complete(struct iwch_ep *ep) +{ + struct iwch_qp_attributes attrs; + int release = 0; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + PANIC_IF(!ep); + + /* The cm_id may be null if we failed to connect */ + mtx_lock(&ep->com.lock); + switch (ep->com.state) { + case CLOSING: + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_socket(&ep->com); + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + case ABORTING: + break; + case DEAD: + default: + PANIC_IF(1); + break; + } + mtx_unlock(&ep->com.lock); + if (release) + put_ep(&ep->com); + return; +} + +/* + * T3A does 3 things when a TERM is received: + * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet + * 2) generate an async event on the QP with the TERMINATE opcode + * 3) post a TERMINATE opcde cqe into the associated CQ. + * + * For (1), we save the message in the qp for later consumer consumption. + * For (2), we move the QP into TERMINATE, post a QP event and disconnect. + * For (3), we toss the CQE in cxio_poll_cq(). + * + * terminate() handles case (1)... + */ +static int +terminate(struct t3cdev *tdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + struct socket *so = toeptoso(toep); + struct iwch_ep *ep = so->so_upcallarg; + + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + m_adj(m, sizeof(struct cpl_rdma_terminate)); + CTR2(KTR_IW_CXGB, "%s saving %d bytes of term msg", __FUNCTION__, m->m_len); + m_copydata(m, 0, m->m_len, ep->com.qp->attr.terminate_buffer); + ep->com.qp->attr.terminate_msg_len = m->m_len; + ep->com.qp->attr.is_terminate_local = 0; + return CPL_RET_BUF_DONE; +} + +static int +ec_status(struct t3cdev *tdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + struct socket *so = toeptoso(toep); + struct cpl_rdma_ec_status *rep = cplhdr(m); + struct iwch_ep *ep; + struct iwch_qp_attributes attrs; + int release = 0; + + ep = so->so_upcallarg; + CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s ec_status %d", __FUNCTION__, ep, ep->com.so, states[ep->com.state], rep->status); + if (!so || !ep) { + panic("bogosity ep %p state %d, so %p state %x\n", ep, ep ? ep->com.state : -1, so, so ? 
so->so_state : -1); + } + mtx_lock(&ep->com.lock); + switch (ep->com.state) { + case CLOSING: + if (!rep->status) + __state_set(&ep->com, MORIBUND); + else + __state_set(&ep->com, ABORTING); + break; + case MORIBUND: + stop_ep_timer(ep); + if (!rep->status) { + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_socket(&ep->com); + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + } + break; + case DEAD: + break; + default: + panic("unknown state: %d\n", ep->com.state); + } + mtx_unlock(&ep->com.lock); + if (rep->status) { + log(LOG_ERR, "%s BAD CLOSE - Aborting tid %u\n", + __FUNCTION__, ep->hwtid); + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + if (release) + put_ep(&ep->com); + return CPL_RET_BUF_DONE; +} + +static void +ep_timeout(void *arg) +{ + struct iwch_ep *ep = (struct iwch_ep *)arg; + struct iwch_qp_attributes attrs; + int err = 0; + + mtx_lock(&ep->com.lock); + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + switch (ep->com.state) { + case MPA_REQ_SENT: + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + break; + case CLOSING: + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) + err = 1; + break; + default: + panic("unknown state: %d\n", ep->com.state); + } + __state_set(&ep->com, ABORTING); + mtx_unlock(&ep->com.lock); + if (err){ + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + abort_connection(ep); + put_ep(&ep->com); +} + +int +iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + struct iwch_ep *ep = to_ep(cm_id); + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return (-ECONNRESET); + } + PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); + if (mpa_rev == 0) { + abort_connection(ep); + } else { + err = send_mpa_reject(ep, pdata, pdata_len); + err = soshutdown(ep->com.so, 3); + } + return 0; +} + +int +iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + struct iwch_ep *ep = to_ep(cm_id); + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_qp *qp = get_qhp(h, conn_param->qpn); + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + if (state_read(&ep->com) == DEAD) + return (-ECONNRESET); + + PANIC_IF(state_read(&ep->com) != MPA_REQ_RCVD); + PANIC_IF(!qp); + + if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || + (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { + abort_connection(ep); + return (-EINVAL); + } + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + + ep->com.rpl_err = 0; + ep->com.rpl_done = 0; + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + CTR3(KTR_IW_CXGB, "%s ird %d ord %d", __FUNCTION__, ep->ird, ep->ord); + get_ep(&ep->com); + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ord; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + 
IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + + if (err) + goto err; + + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) + goto err; + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + put_ep(&ep->com); + return 0; +err: + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return err; +} + +static int init_sock(struct iwch_ep_common *epc) +{ + int err; + struct sockopt sopt; + int on=1; + + epc->so->so_upcall = iwch_so_upcall; + epc->so->so_upcallarg = epc; + epc->so->so_rcv.sb_flags |= SB_UPCALL; + epc->so->so_state |= SS_NBIO; + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = SOL_SOCKET; + sopt.sopt_name = SO_NO_DDP; + sopt.sopt_val = (caddr_t)&on; + sopt.sopt_valsize = sizeof on; + sopt.sopt_td = NULL; + err = sosetopt(epc->so, &sopt); + if (err) + printf("%s can't set SO_NO_DDP err %d\n", __FUNCTION__, err); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = IPPROTO_TCP; + sopt.sopt_name = TCP_NODELAY; + sopt.sopt_val = (caddr_t)&on; + sopt.sopt_valsize = sizeof on; + sopt.sopt_td = NULL; + err = sosetopt(epc->so, &sopt); + if (err) + printf("%s can't set TCP_NODELAY err %d\n", __FUNCTION__, err); + + return 0; +} + +static int +is_loopback_dst(struct iw_cm_id *cm_id) +{ + uint16_t port = cm_id->remote_addr.sin_port; + struct ifaddr *ifa; + + cm_id->remote_addr.sin_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *)&cm_id->remote_addr); + cm_id->remote_addr.sin_port = port; + return (ifa != NULL); +} + +int +iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_ep *ep; + struct rtentry *rt; + struct toedev *tdev; + + if (is_loopback_dst(cm_id)) { + err = -ENOSYS; + goto out; + } + + ep = alloc_ep(sizeof(*ep), M_NOWAIT); + if (!ep) { + printf("%s - cannot alloc ep.\n", __FUNCTION__); + err = (-ENOMEM); + goto out; + } + callout_init(&ep->timer, TRUE); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(h, conn_param->qpn); + ep->com.thread = curthread; + PANIC_IF(!ep->com.qp); + CTR4(KTR_IW_CXGB, "%s qpn 0x%x qp %p cm_id %p", __FUNCTION__, conn_param->qpn, + ep->com.qp, cm_id); + + ep->com.so = cm_id->so; + err = init_sock(&ep->com); + if (err) + goto fail2; + + /* find a route */ + rt = find_route(cm_id->local_addr.sin_addr.s_addr, + cm_id->remote_addr.sin_addr.s_addr, + cm_id->local_addr.sin_port, + cm_id->remote_addr.sin_port, IPTOS_LOWDELAY); + if (!rt) { + printf("%s - cannot find route.\n", __FUNCTION__); + err = EHOSTUNREACH; + goto fail2; + } + + if (!(rt->rt_ifp->if_flags & IFCAP_TOE)) { + printf("%s - interface not TOE capable.\n", __FUNCTION__); + goto fail3; + } + tdev = TOEDEV(rt->rt_ifp); + if (tdev == NULL) { + printf("%s - No toedev for interface.\n", __FUNCTION__); + goto fail3; + } + if (!tdev->tod_can_offload(tdev, ep->com.so)) { + printf("%s - interface cannot offload!.\n", __FUNCTION__); + goto fail3; + } + RTFREE(rt); + + state_set(&ep->com, CONNECTING); + ep->com.local_addr = cm_id->local_addr; + ep->com.remote_addr = cm_id->remote_addr; + err = soconnect(ep->com.so, (struct sockaddr *)&ep->com.remote_addr, + 
ep->com.thread); + if (!err) + goto out; +fail3: + RTFREE(ep->dst); +fail2: + put_ep(&ep->com); +out: + return err; +} + +int +iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct iwch_listen_ep *ep; + + ep = alloc_ep(sizeof(*ep), M_NOWAIT); + if (!ep) { + printf("%s - cannot alloc ep.\n", __FUNCTION__); + err = ENOMEM; + goto out; + } + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->backlog = backlog; + ep->com.local_addr = cm_id->local_addr; + ep->com.thread = curthread; + state_set(&ep->com, LISTEN); + + ep->com.so = cm_id->so; + err = init_sock(&ep->com); + if (err) + goto fail; + + err = solisten(ep->com.so, ep->backlog, ep->com.thread); + if (!err) { + cm_id->provider_data = ep; + goto out; + } + close_socket(&ep->com); +fail: + cm_id->rem_ref(cm_id); + put_ep(&ep->com); +out: + return err; +} + +int +iwch_destroy_listen(struct iw_cm_id *cm_id) +{ + struct iwch_listen_ep *ep = to_listen_ep(cm_id); + + CTR2(KTR_IW_CXGB, "%s ep %p", __FUNCTION__, ep); + + state_set(&ep->com, DEAD); + close_socket(&ep->com); + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return 0; +} + +int +iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags) +{ + int close = 0; + + mtx_lock(&ep->com.lock); + + PANIC_IF(!ep); + PANIC_IF(!ep->com.so); + + CTR5(KTR_IW_CXGB, "%s ep %p so %p state %s, abrupt %d", __FUNCTION__, ep, + ep->com.so, states[ep->com.state], abrupt); + + if (ep->com.state == DEAD) { + CTR2(KTR_IW_CXGB, "%s already dead ep %p", __FUNCTION__, ep); + goto out; + } + + if (abrupt) { + if (ep->com.state != ABORTING) { + ep->com.state = ABORTING; + close = 1; + } + goto out; + } + + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + start_ep_timer(ep); + ep->com.state = CLOSING; + close = 1; + break; + case CLOSING: + ep->com.state = MORIBUND; + close = 1; + break; + case MORIBUND: + case ABORTING: + break; + default: + panic("unknown state: %d\n", ep->com.state); + break; + } +out: + mtx_unlock(&ep->com.lock); + if (close) { + if (abrupt) + abort_connection(ep); + else + shutdown_socket(&ep->com); + } + return 0; +} + +static void +process_data(struct iwch_ep *ep) +{ + struct sockaddr_in *local, *remote; + + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + + switch (state_read(&ep->com)) { + case MPA_REQ_SENT: + process_mpa_reply(ep); + break; + case MPA_REQ_WAIT: + + /* + * XXX + * Set local and remote addrs here because when we + * dequeue the newly accepted socket, they aren't set + * yet in the pcb! + */ + in_getsockaddr(ep->com.so, (struct sockaddr **)&local); + in_getpeeraddr(ep->com.so, (struct sockaddr **)&remote); + CTR3(KTR_IW_CXGB, "%s local %s remote %s", __FUNCTION__, + inet_ntoa(local->sin_addr), + inet_ntoa(remote->sin_addr)); + ep->com.local_addr = *local; + ep->com.remote_addr = *remote; + free(local, M_SONAME); + free(remote, M_SONAME); + process_mpa_request(ep); + break; + default: + if (ep->com.so->so_rcv.sb_cc) + printf("%s Unexpected streaming data." 
+ " ep %p state %d so %p so_state %x so_rcv.sb_cc %u so_rcv.sb_mb %p\n", + __FUNCTION__, ep, state_read(&ep->com), ep->com.so, ep->com.so->so_state, + ep->com.so->so_rcv.sb_cc, ep->com.so->so_rcv.sb_mb); + break; + } + return; +} + +static void +process_connected(struct iwch_ep *ep) +{ + CTR4(KTR_IW_CXGB, "%s ep %p so %p state %s", __FUNCTION__, ep, ep->com.so, states[ep->com.state]); + if ((ep->com.so->so_state & SS_ISCONNECTED) && !ep->com.so->so_error) { + send_mpa_req(ep); + } else { + connect_reply_upcall(ep, -ep->com.so->so_error); + close_socket(&ep->com); + state_set(&ep->com, DEAD); + put_ep(&ep->com); + } +} + +static struct socket * +dequeue_socket(struct socket *head, struct sockaddr_in **remote, struct iwch_ep *child_ep) +{ + struct socket *so; + + ACCEPT_LOCK(); + so = TAILQ_FIRST(&head->so_comp); + if (!so) { + ACCEPT_UNLOCK(); + return NULL; + } + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + SOCK_LOCK(so); + so->so_qstate &= ~SQ_COMP; + so->so_head = NULL; + soref(so); + so->so_rcv.sb_flags |= SB_UPCALL; + so->so_state |= SS_NBIO; + so->so_upcall = iwch_so_upcall; + so->so_upcallarg = child_ep; + PANIC_IF(!(so->so_state & SS_ISCONNECTED)); + PANIC_IF(so->so_error); + SOCK_UNLOCK(so); + ACCEPT_UNLOCK(); + soaccept(so, (struct sockaddr **)remote); + return so; +} + +static void +process_newconn(struct iwch_ep *parent_ep) +{ + struct socket *child_so; + struct iwch_ep *child_ep; + struct sockaddr_in *remote; + + CTR3(KTR_IW_CXGB, "%s parent ep %p so %p", __FUNCTION__, parent_ep, parent_ep->com.so); + child_ep = alloc_ep(sizeof(*child_ep), M_NOWAIT); + if (!child_ep) { + log(LOG_ERR, "%s - failed to allocate ep entry!\n", + __FUNCTION__); + return; + } + child_so = dequeue_socket(parent_ep->com.so, &remote, child_ep); + if (!child_so) { + log(LOG_ERR, "%s - failed to dequeue child socket!\n", + __FUNCTION__); + __free_ep(&child_ep->com); + return; + } + CTR3(KTR_IW_CXGB, "%s remote addr %s port %d", __FUNCTION__, + inet_ntoa(remote->sin_addr), ntohs(remote->sin_port)); + child_ep->com.so = child_so; + child_ep->com.cm_id = NULL; + child_ep->com.thread = parent_ep->com.thread; + child_ep->parent_ep = parent_ep; + free(remote, M_SONAME); + get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + callout_init(&child_ep->timer, TRUE); + state_set(&child_ep->com, MPA_REQ_WAIT); + start_ep_timer(child_ep); + + /* maybe the request has already been queued up on the socket... 
*/ + process_mpa_request(child_ep); +} + +static void +iwch_so_upcall(struct socket *so, void *arg, int waitflag) +{ + struct iwch_ep *ep = arg; + + CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); + mtx_lock(&req_lock); + if (ep && ep->com.so && !ep->com.entry.tqe_prev) { + get_ep(&ep->com); + TAILQ_INSERT_TAIL(&req_list, &ep->com, entry); + taskqueue_enqueue(iw_cxgb_taskq, &iw_cxgb_task); + } + mtx_unlock(&req_lock); +} + +static void +process_socket_event(struct iwch_ep *ep) +{ + int state = state_read(&ep->com); + struct socket *so = ep->com.so; + + CTR6(KTR_IW_CXGB, "%s so %p so state %x ep %p ep state(%d)=%s", __FUNCTION__, so, so->so_state, ep, ep->com.state, states[ep->com.state]); + if (state == CONNECTING) { + process_connected(ep); + return; + } + + if (state == LISTEN) { + process_newconn(ep); + return; + } + + /* connection error */ + if (so->so_error) { + process_conn_error(ep); + return; + } + + /* peer close */ + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && state < CLOSING) { + process_peer_close(ep); + return; + } + + /* close complete */ + if (so->so_state & (SS_ISDISCONNECTED)) { + process_close_complete(ep); + return; + } + + /* rx data */ + process_data(ep); + return; +} + +static void +process_req(void *ctx, int pending) +{ + struct iwch_ep_common *epc; + + CTR1(KTR_IW_CXGB, "%s enter", __FUNCTION__); + mtx_lock(&req_lock); + while (!TAILQ_EMPTY(&req_list)) { + epc = TAILQ_FIRST(&req_list); + TAILQ_REMOVE(&req_list, epc, entry); + epc->entry.tqe_prev = NULL; + mtx_unlock(&req_lock); + if (epc->so) + process_socket_event((struct iwch_ep *)epc); + put_ep(epc); + mtx_lock(&req_lock); + } + mtx_unlock(&req_lock); +} + +int +iwch_cm_init(void) +{ + TAILQ_INIT(&req_list); + mtx_init(&req_lock, "iw_cxgb req_list lock", NULL, MTX_DEF); + iw_cxgb_taskq = taskqueue_create("iw_cxgb_taskq", M_NOWAIT, + taskqueue_thread_enqueue, &iw_cxgb_taskq); + if (iw_cxgb_taskq == NULL) { + printf("failed to allocate iw_cxgb taskqueue\n"); + return (ENOMEM); + } + taskqueue_start_threads(&iw_cxgb_taskq, 1, PI_NET, "iw_cxgb taskq"); + TASK_INIT(&iw_cxgb_task, 0, process_req, NULL); + t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, terminate); + t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, ec_status); + return 0; +} + +void +iwch_cm_term(void) +{ + t3tom_register_cpl_handler(CPL_RDMA_TERMINATE, NULL); + t3tom_register_cpl_handler(CPL_RDMA_EC_STATUS, NULL); + taskqueue_drain(iw_cxgb_taskq, &iw_cxgb_task); + taskqueue_free(iw_cxgb_taskq); +} + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h new file mode 100644 index 0000000000000..4250be33300ad --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h @@ -0,0 +1,249 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ + +#ifndef _IWCH_CM_H_ +#define _IWCH_CM_H_ +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/iw_cm.h> +#include <sys/refcount.h> +#include <sys/condvar.h> +#include <sys/proc.h> + + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_REV o0 /* XXX - amso1100 uses rev 0 ! */ +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define put_ep(ep) { \ + CTR4(KTR_IW_CXGB, "put_ep (via %s:%u) ep %p refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_load_acq_int(&((ep)->refcount))); \ + if (refcount_release(&((ep)->refcount))) \ + __free_ep(ep); \ +} + +#define get_ep(ep) { \ + CTR4(KTR_IW_CXGB, "get_ep (via %s:%u) ep %p, refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_load_acq_int(&((ep)->refcount))); \ + refcount_acquire(&((ep)->refcount)); \ +} + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum iwch_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum iwch_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum iwch_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum iwch_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03 +}; + +enum iwch_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +enum iwch_ep_flags { + PEER_ABORT_IN_PROGRESS = (1 << 0), + ABORT_REQ_IN_PROGRESS = (1 << 1), +}; + +struct iwch_ep_common { + TAILQ_ENTRY(iwch_ep_common) entry; + struct iw_cm_id *cm_id; + struct iwch_qp *qp; + struct t3cdev *tdev; + enum iwch_ep_state state; + u_int refcount; + struct cv waitq; + struct mtx lock; + struct 
sockaddr_in local_addr; + struct sockaddr_in remote_addr; + int rpl_err; + int rpl_done; + struct thread *thread; + struct socket *so; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; + unsigned int stid; + int backlog; +}; + +struct iwch_ep { + struct iwch_ep_common com; + struct iwch_ep *parent_ep; + struct callout timer; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + u32 rcv_seq; + struct l2t_entry *l2t; + struct rtentry *dst; + struct mbuf *mpa_mbuf; + struct iwch_mpa_attributes mpa_attr; + unsigned int mpa_pkt_len; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + u8 tos; + u16 emss; + u16 plen; + u32 ird; + u32 ord; + u32 flags; +}; + +static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<<wscale) < win) + wscale++; + return wscale; +} + +static __inline void +iwch_wait(struct cv *cv, struct mtx *lock, int *rpl_done) +{ + mtx_lock(lock); + if (!*rpl_done) { + CTR0(KTR_IW_CXGB, "sleeping for rpl_done\n"); + cv_wait_unlock(cv, lock); + } + CTR1(KTR_IW_CXGB, "*rpl_done=%d\n", *rpl_done); +} + +static __inline void +iwch_wakeup(struct cv *cv, struct mtx *lock, int *rpl_done) +{ + mtx_lock(lock); + *rpl_done=1; + CTR0(KTR_IW_CXGB, "wakeup for rpl_done\n"); + cv_broadcast(cv); + mtx_unlock(lock); +} + +/* CM prototypes */ + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog); +int iwch_destroy_listen(struct iw_cm_id *cm_id); +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, int flags); +int iwch_quiesce_tid(struct iwch_ep *ep); +int iwch_resume_tid(struct iwch_ep *ep); +void __free_ep(struct iwch_ep_common *ep); +void iwch_rearp(struct iwch_ep *ep); +int iwch_ep_redirect(void *ctx, struct rtentry *old, struct rtentry *new, struct l2t_entry *l2t); + +int iwch_cm_init(void); +void iwch_cm_term(void); + +#endif /* _IWCH_CM_H_ */ diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c new file mode 100644 index 0000000000000..93a9e89fd39d2 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_cq.c @@ -0,0 +1,276 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +/* + * Get one cq entry from cxio and map it to openib. 
+ * + * Returns: + * 0 cqe returned + * -ENOBUFS EMPTY; + * -EAGAIN caller must try again + * any other neg errno fatal error + */ +static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, + struct ib_wc *wc) +{ + struct iwch_qp *qhp = NULL; + struct t3_cqe cqe, *rd_cqe; + struct t3_wq *wq; + u32 credit = 0; + u8 cqe_flushed; + u64 cookie; + int ret = 1; + + rd_cqe = cxio_next_cqe(&chp->cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (!qhp) + wq = NULL; + else { + mtx_lock(&qhp->lock); + wq = &(qhp->wq); + } + ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, + &credit); + if (t3a_device(chp->rhp) && credit) { + CTR3(KTR_IW_CXGB, "%s updating %d cq credits on id %d", __FUNCTION__, + credit, chp->cq.cqid); + cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit); + } + + if (ret) { + ret = -EAGAIN; + goto out; + } + ret = 1; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(cqe); + + CTR4(KTR_IW_CXGB, "iwch_poll_cq_one qpid 0x%x type %d opcode %d status 0x%x", + CQE_QPID(cqe), CQE_TYPE(cqe), + CQE_OPCODE(cqe), CQE_STATUS(cqe)); + CTR3(KTR_IW_CXGB, "wrid hi 0x%x lo 0x%x cookie 0x%llx", + CQE_WRID_HI(cqe), CQE_WRID_LOW(cqe), (unsigned long long) cookie); + + if (CQE_TYPE(cqe) == 0) { + if (!CQE_STATUS(cqe)) + wc->byte_len = CQE_LEN(cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + } else { + switch (CQE_OPCODE(cqe)) { + case T3_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case T3_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(cqe); + break; + case T3_SEND: + case T3_SEND_WITH_SE: + wc->opcode = IB_WC_SEND; + break; + case T3_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + /* these aren't supported yet */ + case T3_SEND_WITH_INV: + case T3_SEND_WITH_SE_INV: + case T3_LOCAL_INV: + case T3_FAST_REGISTER: + default: + log(LOG_ERR, "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(cqe)) { + case TPT_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case TPT_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case TPT_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + case TPT_ERR_OPCODE: + wc->status = IB_WC_FATAL_ERR; + break; + case TPT_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + log(LOG_ERR, "Unexpected cqe_status 0x%x for " + "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + mtx_unlock(&qhp->lock); + return ret; +} + +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + int npolled; + int err = 0; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + + mtx_lock(&chp->lock); + for (npolled = 0; npolled < 
num_entries; ++npolled) { +#ifdef DEBUG + int i=0; +#endif + + /* + * Because T3 can post CQEs that are _not_ associated + * with a WR, we might have to poll again after removing + * one of these. + */ + do { + err = iwch_poll_cq_one(rhp, chp, wc + npolled); +#ifdef DEBUG + PANIC_IF(++i > 1000); +#endif + } while (err == -EAGAIN); + if (err <= 0) + break; + } + mtx_unlock(&chp->lock); + + if (err < 0) { + return err; + } else { + return npolled; + } +} + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c new file mode 100644 index 0000000000000..8b52119e306ec --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_dbg.c @@ -0,0 +1,255 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + +#ifdef DEBUG +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +void cxio_dump_tpt(struct cxio_rdev *rdev, uint32_t stag) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size = 32; + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; + m->len = size; + CTR3(KTR_IW_CXGB, "%s TPT addr 0x%x len %d", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + CTR2(KTR_IW_CXGB, "TPT %08x: %016llx", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + free(m, M_DEVBUF); +} + +void cxio_dump_pbl(struct cxio_rdev *rdev, uint32_t pbl_addr, uint32_t len, u8 shift) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size, npages; + + shift += 12; + npages = (len + (1ULL << shift) - 1) >> shift; + size = npages * sizeof(u64); + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = pbl_addr; + m->len = size; + CTR4(KTR_IW_CXGB, "%s PBL addr 0x%x len %d depth %d", + __FUNCTION__, m->addr, m->len, npages); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + CTR2(KTR_IW_CXGB, "PBL %08x: %016llx", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + free(m, M_DEVBUF); +} + +void cxio_dump_wqe(union t3_wr *wqe) +{ + uint64_t *data = (uint64_t *)wqe; + uint32_t size = (uint32_t)(be64toh(*data) & 0xff); + + if (size == 
0) + size = 8; + while (size > 0) { + CTR2(KTR_IW_CXGB, "WQE %p: %016llx", data, + (unsigned long long) be64toh(*data)); + size--; + data++; + } +} + +void cxio_dump_wce(struct t3_cqe *wce) +{ + uint64_t *data = (uint64_t *)wce; + int size = sizeof(*wce); + + while (size > 0) { + CTR2(KTR_IW_CXGB, "WCE %p: %016llx", data, + (unsigned long long) be64toh(*data)); + size -= 8; + data++; + } +} + +void cxio_dump_rqt(struct cxio_rdev *rdev, uint32_t hwtid, int nents) +{ + struct ch_mem_range *m; + int size = nents * 64; + u64 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; + m->len = size; + CTR3(KTR_IW_CXGB, "%s RQT addr 0x%x len %d", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + CTR2(KTR_IW_CXGB, "RQT %08x: %016llx", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + free(m, M_DEVBUF); +} + +void cxio_dump_tcb(struct cxio_rdev *rdev, uint32_t hwtid) +{ + struct ch_mem_range *m; + int size = TCB_SIZE; + uint32_t *data; + int rc; + + m = kmalloc(sizeof(*m) + size, M_NOWAIT); + if (!m) { + CTR1(KTR_IW_CXGB, "%s couldn't allocate memory.", __FUNCTION__); + return; + } + m->mem_id = MEM_CM; + m->addr = hwtid * size; + m->len = size; + CTR3(KTR_IW_CXGB, "%s TCB %d len %d", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + CTR2(KTR_IW_CXGB, "%s toectl returned error %d", __FUNCTION__, rc); + free(m, M_DEVBUF); + return; + } + + data = (uint32_t *)m->buf; + while (size > 0) { + printf("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", + m->addr, + *(data+2), *(data+3), *(data),*(data+1), + *(data+6), *(data+7), *(data+4), *(data+5)); + size -= 32; + data += 8; + m->addr += 32; + } + free(m, M_DEVBUF); +} +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c new file mode 100644 index 0000000000000..2e8154731133d --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_ev.c @@ -0,0 +1,265 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +static void +post_qp_event(struct iwch_dev *rnicp, struct iwch_qp *qhp, struct iwch_cq *chp, + struct respQ_msg_t *rsp_msg, + enum ib_event_type ib_event, + int send_term) +{ + struct ib_event event; + struct iwch_qp_attributes attrs; + + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || + (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { + CTR4(KTR_IW_CXGB, "%s AE received after RTS - " + "qp state %d qpid 0x%x status 0x%x", __FUNCTION__, + qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe)); + return; + } + + log(LOG_ERR, "%s - AE qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } +} + +void +iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct mbuf *m) +{ + struct iwch_dev *rnicp; 
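+	/*
+	 * Look up the CQ and QP named in the response-queue message,
+	 * take a reference on each, and then route the event by CQE
+	 * opcode/status; events for unknown owners are logged and the
+	 * mbuf is dropped.
+	 */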
+ struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data; + struct iwch_cq *chp; + struct iwch_qp *qhp; + u32 cqid = RSPQ_CQID(rsp_msg); + + rnicp = (struct iwch_dev *) rdev_p->ulp; + mtx_lock(&rnicp->lock); + chp = get_chp(rnicp, cqid); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + if (!chp || !qhp) { + log(LOG_ERR,"BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n", + cqid, CQE_QPID(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), + CQE_WRID_LOW(rsp_msg->cqe)); + mtx_unlock(&rnicp->lock); + goto out; + } + iwch_qp_add_ref(&qhp->ibqp); + mtx_lock(&chp->lock); + ++chp->refcnt; + mtx_unlock(&chp->lock); + mtx_unlock(&rnicp->lock); + + /* + * 1) completion of our sending a TERMINATE. + * 2) incoming TERMINATE message. + */ + if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) && + (CQE_STATUS(rsp_msg->cqe) == 0)) { + if (SQ_TYPE(rsp_msg->cqe)) { + CTR3(KTR_IW_CXGB, "%s QPID 0x%x ep %p disconnecting", + __FUNCTION__, qhp->wq.qpid, qhp->ep); + iwch_ep_disconnect(qhp->ep, 0, M_NOWAIT); + } else { + CTR2(KTR_IW_CXGB, "%s post REQ_ERR AE QPID 0x%x", __FUNCTION__, + qhp->wq.qpid); + post_qp_event(rnicp, qhp, chp, rsp_msg, + IB_EVENT_QP_REQ_ERR, 0); + iwch_ep_disconnect(qhp->ep, 0, M_NOWAIT); + } + goto done; + } + + /* Bad incoming Read request */ + if (SQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) { + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + /* Bad incoming write */ + if (RQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) { + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + switch (CQE_STATUS(rsp_msg->cqe)) { + + /* Completion Events */ + case TPT_ERR_SUCCESS: +#if 0 + /* + * Confirm the destination entry if this is a RECV completion. 
+ */ + if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) + dst_confirm(qhp->ep->dst); +#endif + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + break; + + case TPT_ERR_STAG: + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + case TPT_ERR_WRAP: + case TPT_ERR_BOUND: + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + log(LOG_ERR, "%s - CQE Err qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); + break; + + /* Device Fatal Errors */ + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1); + break; + + /* QP Fatal Errors */ + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_PBL_ADDR_BOUND: + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_OPCODE: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_RQE_ADDR_BOUND: + case TPT_ERR_IRD_OVERFLOW: + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + + default: + log(LOG_ERR,"Unknown T3 status 0x%x QPID 0x%x\n", + CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid); + post_qp_event(rnicp, qhp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + } +done: + mtx_lock(&chp->lock); + if (--chp->refcnt == 0) + wakeup(chp); + mtx_unlock(&chp->lock); + iwch_qp_rem_ref(&qhp->ibqp); +out: + m_free(m); +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c new file mode 100644 index 0000000000000..0309b53ba3c03 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.c @@ -0,0 +1,1418 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#endif + +static TAILQ_HEAD( ,cxio_rdev) rdev_list; +static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL; + +static struct cxio_rdev * +cxio_hal_find_rdev_by_name(char *dev_name) +{ + struct cxio_rdev *rdev; + + TAILQ_FOREACH(rdev, &rdev_list, entry) + if (!strcmp(rdev->dev_name, dev_name)) + return rdev; + return NULL; +} + +struct cxio_rdev * +cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev) +{ + struct cxio_rdev *rdev; + + TAILQ_FOREACH(rdev, &rdev_list, entry) + if (rdev->t3cdev_p == tdev) + return rdev; + return NULL; +} + +int +cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit) +{ + int ret; + struct t3_cqe *cqe; + u32 rptr; + + struct rdma_cq_op setup; + setup.id = cq->cqid; + setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0; + setup.op = op; + ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup); + + if ((ret < 0) || (op == CQ_CREDIT_UPDATE)) + return (ret); + + /* + * If the rearm returned an index other than our current index, + * then there might be CQE's in flight (being DMA'd). We must wait + * here for them to complete or the consumer can miss a notification. + */ + if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) { + int i=0; + + rptr = cq->rptr; + + /* + * Keep the generation correct by bumping rptr until it + * matches the index returned by the rearm - 1. + */ + while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret) + rptr++; + + /* + * Now rptr is the index for the (last) cqe that was + * in-flight at the time the HW rearmed the CQ. We + * spin until that CQE is valid. 
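+	 * The hardware may still be DMA'ing that entry, so poll it with a
+	 * 1us DELAY() per iteration and give up with -EIO ("stalled rnic")
+	 * after roughly one second.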
+ */ + cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2); + while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) { + DELAY(1); + if (i++ > 1000000) { + PANIC_IF(1); + log(LOG_ERR, "%s: stalled rnic\n", + rdev_p->dev_name); + return (-EIO); + } + } + + return 1; + } + + return 0; +} + +static int +cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid) +{ + struct rdma_cq_setup setup; + setup.id = cqid; + setup.base_addr = 0; /* NULL address */ + setup.size = 0; /* disaable the CQ */ + setup.credits = 0; + setup.credit_thres = 0; + setup.ovfl_mode = 0; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int +cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) +{ + u64 sge_cmd; + struct t3_modify_qp_wr *wqe; + struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT); + if (m == NULL) { + CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__); + return (-ENOMEM); + } + wqe = mtod(m, struct t3_modify_qp_wr *); + m->m_len = m->m_pkthdr.len = sizeof(*wqe); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, qpid, 7); + wqe->flags = htobe32(MODQP_WRITE_EC); + sge_cmd = qpid << 8 | 3; + wqe->sge_cmd = htobe64(sge_cmd); + m_set_priority(m, CPL_PRIORITY_CONTROL); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); +} + +int +cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + + cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); + if (!cq->cqid) + return (-ENOMEM); + cq->sw_queue = malloc(size, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!cq->sw_queue) + return (-ENOMEM); +#if 0 + cq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev, + (1UL << (cq->size_log2)) * + sizeof(struct t3_cqe), + &(cq->dma_addr), M_NOWAIT); +#else + cq->queue = contigmalloc((1UL << (cq->size_log2))*sizeof(struct t3_cqe), + M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); + if (cq->queue) + cq->dma_addr = vtophys(cq->queue); + else { + free(cq->sw_queue, M_DEVBUF); + return (-ENOMEM); + } +#endif + +#ifdef notyet + pci_unmap_addr_set(cq, mapping, cq->dma_addr); +#endif + memset(cq->queue, 0, size); + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = 65535; + setup.credit_thres = 1; + if (rdev_p->t3cdev_p->type != T3A) + setup.ovfl_mode = 0; + else + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +int +cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = setup.size; + setup.credit_thres = setup.size; /* TBD: overflow recovery */ + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static u32 +get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid *entry; + u32 qpid; + int i; + + mtx_lock(&uctx->lock); + if (!TAILQ_EMPTY(&uctx->qpids)) { + + entry = TAILQ_FIRST(&uctx->qpids); + TAILQ_REMOVE(&uctx->qpids, entry, entry); + qpid = entry->qpid; + free(entry, M_DEVBUF); + } else { + qpid = cxio_hal_get_qpid(rdev_p->rscp); + if (!qpid) + goto out; + for (i = qpid+1; i & rdev_p->qpmask; i++) { + entry = malloc(sizeof *entry, M_DEVBUF, M_NOWAIT); + if (!entry) + break; + entry->qpid = i; + TAILQ_INSERT_TAIL(&uctx->qpids, entry, entry); + } + } +out: + mtx_unlock(&uctx->lock); + CTR2(KTR_IW_CXGB, 
"%s qpid 0x%x", __FUNCTION__, qpid); + return qpid; +} + +static void +put_qpid(struct cxio_rdev *rdev_p, u32 qpid, + struct cxio_ucontext *uctx) +{ + struct cxio_qpid *entry; + + entry = malloc(sizeof *entry, M_DEVBUF, M_NOWAIT); + CTR2(KTR_IW_CXGB, "%s qpid 0x%x", __FUNCTION__, qpid); + entry->qpid = qpid; + mtx_lock(&uctx->lock); + TAILQ_INSERT_TAIL(&uctx->qpids, entry, entry); + mtx_unlock(&uctx->lock); +} + +void +cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid *pos, *tmp; + + mtx_lock(&uctx->lock); + TAILQ_FOREACH_SAFE(pos, &uctx->qpids, entry, tmp) { + TAILQ_REMOVE(&uctx->qpids, pos, entry); + if (!(pos->qpid & rdev_p->qpmask)) + cxio_hal_put_qpid(rdev_p->rscp, pos->qpid); + free(pos, M_DEVBUF); + } + mtx_unlock(&uctx->lock); +} + +void +cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + TAILQ_INIT(&uctx->qpids); + mtx_init(&uctx->lock, "cxio uctx", NULL, MTX_DEF|MTX_DUPOK); +} + +int +cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, + struct t3_wq *wq, struct cxio_ucontext *uctx) +{ + int depth = 1UL << wq->size_log2; + int rqsize = 1UL << wq->rq_size_log2; + + wq->qpid = get_qpid(rdev_p, uctx); + if (!wq->qpid) + return (-ENOMEM); + + wq->rq = malloc(depth * sizeof(u64), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!wq->rq) + goto err1; + + wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize); + if (!wq->rq_addr) + goto err2; + + wq->sq = malloc(depth * sizeof(struct t3_swsq), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!wq->sq) + goto err3; +#if 0 + wq->queue = dma_alloc_coherent(rdev_p->rnic_info.pdev, + depth * sizeof(union t3_wr), + &(wq->dma_addr), M_NOWAIT); +#else + wq->queue = contigmalloc(depth *sizeof(union t3_wr), + M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); + if (wq->queue) + wq->dma_addr = vtophys(wq->queue); + +#endif + if (!wq->queue) + goto err4; + + memset(wq->queue, 0, depth * sizeof(union t3_wr)); +#ifdef notyet + pci_unmap_addr_set(wq, mapping, wq->dma_addr); +#endif + wq->doorbell = rdev_p->rnic_info.kdb_addr; + if (!kernel_domain) + wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + + (wq->qpid << rdev_p->qpshift); + CTR4(KTR_IW_CXGB, "%s qpid 0x%x doorbell 0x%p udb 0x%llx", __FUNCTION__, + wq->qpid, wq->doorbell, (unsigned long long) wq->udb); + return 0; +err4: + free(wq->sq, M_DEVBUF); +err3: + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize); +err2: + free(wq->rq, M_DEVBUF); +err1: + put_qpid(rdev_p, wq->qpid, uctx); + return (-ENOMEM); +} + +int +cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + int err; + err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + free(cq->sw_queue, M_DEVBUF); +#if 0 + dma_free_coherent(&(rdev_p->rnic_info.pdev), + (1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), cq->queue, + /* pci_unmap_addr(cq, mapping)*/ 0); +#else + contigfree(cq->queue,(1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), M_DEVBUF); +#endif + cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); + return err; +} + +int +cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, + struct cxio_ucontext *uctx) +{ + +#if 0 + dma_free_coherent(&(rdev_p->rnic_info.pdev), + (1UL << (wq->size_log2)) + * sizeof(union t3_wr), wq->queue, + /* pci_unmap_addr(wq, mapping)*/ 0); +#else + contigfree(wq->queue, (1UL << (wq->size_log2)) + * sizeof(union t3_wr), M_DEVBUF); +#endif + free(wq->sq, M_DEVBUF); + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2)); + free(wq->rq, M_DEVBUF); + put_qpid(rdev_p, wq->qpid, uctx); + return 0; +} + +static void +insert_recv_cqe(struct t3_wq *wq, 
struct t3_cq *cq) +{ + struct t3_cqe cqe; + + CTR5(KTR_IW_CXGB, "%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = htobe32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(T3_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void +cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + u32 ptr; + + CTR3(KTR_IW_CXGB, "%s wq %p cq %p", __FUNCTION__, wq, cq); + + /* flush RQ */ + CTR4(KTR_IW_CXGB, "%s rq_rptr %u rq_wptr %u skip count %u", __FUNCTION__, + wq->rq_rptr, wq->rq_wptr, count); + ptr = wq->rq_rptr + count; + while (ptr++ != wq->rq_wptr) + insert_recv_cqe(wq, cq); +} + +static void +insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, + struct t3_swsq *sqp) +{ + struct t3_cqe cqe; + + CTR5(KTR_IW_CXGB, "%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = htobe32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(sqp->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + cqe.u.scqe.wrid_hi = sqp->sq_wptr; + + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void +cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + __u32 ptr; + struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); + + ptr = wq->sq_rptr + count; + sqp += count; + while (ptr != wq->sq_wptr) { + insert_sq_cqe(wq, cq, sqp); + sqp++; + ptr++; + } +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. 
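+ * Each copied entry is tagged with V_CQE_SWCQE(1) so it is subsequently
+ * treated as a software CQE.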
+ */ +void +cxio_flush_hw_cq(struct t3_cq *cq) +{ + struct t3_cqe *cqe, *swcqe; + + CTR3(KTR_IW_CXGB, "%s cq %p cqid 0x%x", __FUNCTION__, cq, cq->cqid); + cqe = cxio_next_hw_cqe(cq); + while (cqe) { + CTR3(KTR_IW_CXGB, "%s flushing hwcq rptr 0x%x to swcq wptr 0x%x", + __FUNCTION__, cq->rptr, cq->sw_wptr); + swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2); + *swcqe = *cqe; + swcqe->header |= htobe32(V_CQE_SWCQE(1)); + cq->sw_wptr++; + cq->rptr++; + cqe = cxio_next_hw_cqe(cq); + } +} + +static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) +{ + if (CQE_OPCODE(*cqe) == T3_TERMINATE) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) + return 0; + + return 1; +} + +void +cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if ((SQ_TYPE(*cqe) || (CQE_OPCODE(*cqe) == T3_READ_RESP)) && + (CQE_QPID(*cqe) == wq->qpid)) + (*count)++; + ptr++; + } + CTR3(KTR_IW_CXGB, "%s cq %p count %d", __FUNCTION__, cq, *count); +} + +void +cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + CTR2(KTR_IW_CXGB, "%s count zero %d", __FUNCTION__, *count); + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) && + (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq)) + (*count)++; + ptr++; + } + CTR3(KTR_IW_CXGB, "%s cq %p count %d", __FUNCTION__, cq, *count); +} + +static int +cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) +{ + struct rdma_cq_setup setup; + setup.id = 0; + setup.base_addr = 0; /* NULL address */ + setup.size = 1; /* enable the CQ */ + setup.credits = 0; + + /* force SGE to redirect to RspQ and interrupt */ + setup.credit_thres = 0; + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int +cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) +{ + int err; + u64 sge_cmd, ctx0, ctx1; + u64 base_addr; + struct t3_modify_qp_wr *wqe; + struct mbuf *m; + + m = m_gethdr(MT_DATA, M_NOWAIT); + if (m == NULL) { + CTR1(KTR_IW_CXGB, "%s m_gethdr failed", __FUNCTION__); + return (-ENOMEM); + } + err = cxio_hal_init_ctrl_cq(rdev_p); + if (err) { + CTR2(KTR_IW_CXGB, "%s err %d initializing ctrl_cq", __FUNCTION__, err); + goto err; + } +#if 0 + rdev_p->ctrl_qp.workq = dma_alloc_coherent( + rdev_p->rnic_info.pdev, + (1 << T3_CTRL_QP_SIZE_LOG2) * + sizeof(union t3_wr), + &(rdev_p->ctrl_qp.dma_addr), + M_NOWAIT); +#else + rdev_p->ctrl_qp.workq = contigmalloc((1 << T3_CTRL_QP_SIZE_LOG2) + *sizeof(union t3_wr), M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); + if (rdev_p->ctrl_qp.workq) + rdev_p->ctrl_qp.dma_addr = vtophys(rdev_p->ctrl_qp.workq); + +#endif + + if (!rdev_p->ctrl_qp.workq) { + CTR1(KTR_IW_CXGB, "%s dma_alloc_coherent failed", __FUNCTION__); + err = -ENOMEM; + goto err; + } +#if 0 + pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping, + rdev_p->ctrl_qp.dma_addr); +#endif + rdev_p->ctrl_qp.doorbell = (void /*__iomem */ *)rdev_p->rnic_info.kdb_addr; + memset(rdev_p->ctrl_qp.workq, 0, + (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); + + mtx_init(&rdev_p->ctrl_qp.lock, "ctl-qp 
lock", NULL, MTX_DEF|MTX_DUPOK); + + /* update HW Ctrl QP context */ + base_addr = rdev_p->ctrl_qp.dma_addr; + base_addr >>= 12; + ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) | + V_EC_BASE_LO((u32) base_addr & 0xffff)); + ctx0 <<= 32; + ctx0 |= V_EC_CREDITS(FW_WR_NUM); + base_addr >>= 16; + ctx1 = (u32) base_addr; + base_addr >>= 32; + ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | + V_EC_TYPE(0) | V_EC_GEN(1) | + V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; + wqe = mtod(m, struct t3_modify_qp_wr *); + m->m_len = m->m_pkthdr.len = sizeof(*wqe); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 0, + T3_CTL_QP_TID, 7); + wqe->flags = htobe32(MODQP_WRITE_EC); + sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; + wqe->sge_cmd = htobe64(sge_cmd); + wqe->ctx1 = htobe64(ctx1); + wqe->ctx0 = htobe64(ctx0); + CTR3(KTR_IW_CXGB, "CtrlQP dma_addr 0x%llx workq %p size %d", + (unsigned long long) rdev_p->ctrl_qp.dma_addr, + rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + m_set_priority(m, CPL_PRIORITY_CONTROL); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); +err: + m_free(m); + return err; +} + +static int +cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) +{ +#if 0 + + dma_free_coherent(&(rdev_p->rnic_info.pdev), + (1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), rdev_p->ctrl_qp.workq, + /* pci_unmap_addr(&rdev_p->ctrl_qp, mapping)*/ 0); +#else + contigfree(rdev_p->ctrl_qp.workq,(1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), M_DEVBUF); +#endif + return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID); +} + +/* write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + * caller aquires the ctrl_qp lock before the call + */ +static int +cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, + u32 len, void *data, int completion) +{ + u32 i, nr_wqe, copy_len; + u8 *copy_data; + u8 wr_len, utx_len; /* lenght in 8 byte flit */ + enum t3_wr_flags flag; + __be64 *wqe; + u64 utx_cmd; + addr &= 0x7FFFFFF; + nr_wqe = len % 96 ? len / 96 + 1 : len / 96; /* 96B max per WQE */ + CTR6(KTR_IW_CXGB, "cxio_hal_ctrl_qp_write_mem wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x", + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len, + nr_wqe, data, addr); + utx_len = 3; /* in 32B unit */ + for (i = 0; i < nr_wqe; i++) { + if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2)) { + CTR4(KTR_IW_CXGB, "%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, " + "wait for more space i %d", __FUNCTION__, + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i); + if (cxio_wait(&rdev_p->ctrl_qp, + &rdev_p->ctrl_qp.lock, + !Q_FULL(rdev_p->ctrl_qp.rptr, + rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2))) { + CTR1(KTR_IW_CXGB, "%s ctrl_qp workq interrupted", + __FUNCTION__); + return (-ERESTART); + } + CTR2(KTR_IW_CXGB, "%s ctrl_qp wakeup, continue posting work request " + "i %d", __FUNCTION__, i); + } + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + flag = 0; + if (i == (nr_wqe - 1)) { + /* last WQE */ + flag = completion ? 
T3_COMPLETION_FLAG : 0; + if (len % 32) + utx_len = len / 32 + 1; + else + utx_len = len / 32; + } + + /* + * Force a CQE to return the credit to the workq in case + * we posted more than half the max QP size of WRs + */ + if ((i != 0) && + (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) { + flag = T3_COMPLETION_FLAG; + CTR2(KTR_IW_CXGB, "%s force completion at i %d", __FUNCTION__, i); + } + + /* build the utx mem command */ + wqe += (sizeof(struct t3_bypass_wr) >> 3); + utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3); + utx_cmd <<= 32; + utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1); + *wqe = htobe64(utx_cmd); + wqe++; + copy_data = (u8 *) data + i * 96; + copy_len = len > 96 ? 96 : len; + + /* clear memory content if data is NULL */ + if (data) + memcpy(wqe, copy_data, copy_len); + else + memset(wqe, 0, copy_len); + if (copy_len % 32) + memset(((u8 *) wqe) + copy_len, 0, + 32 - (copy_len % 32)); + wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 + + (utx_len << 2); + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + + /* wptr in the WRID[31:0] */ + ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr; + + /* + * This must be the last write with a memory barrier + * for the genbit + */ + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag, + Q_GENBIT(rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID, + wr_len); + if (flag == T3_COMPLETION_FLAG) + ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID); + + len -= 96; + rdev_p->ctrl_qp.wptr++; + } + return 0; +} + +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size + * OUT: stag index, actual pbl_size, pbl_addr allocated. + * TBD: shared memory region support + */ +static int +__cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum tpt_mem_type type, enum tpt_mem_perm perm, + u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, + u32 *pbl_size, u32 *pbl_addr) +{ + int err; + struct tpt_entry tpt; + u32 stag_idx; + u32 wptr; + int rereg = (*stag != T3_STAG_UNSET); + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) { + stag_idx = cxio_hal_get_stag(rdev_p->rscp); + if (!stag_idx) + return (-ENOMEM); + *stag = (stag_idx << 8) | ((*stag) & 0xFF); + } + CTR5(KTR_IW_CXGB, "%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x", + __FUNCTION__, stag_state, type, pdid, stag_idx); + + if (reset_tpt_entry) + cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); + else if (!rereg) { + *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); + if (!*pbl_addr) { + return (-ENOMEM); + } + } + + mtx_lock(&rdev_p->ctrl_qp.lock); + + /* write PBL first if any - update pbl only if pbl list exist */ + if (pbl) { + + CTR4(KTR_IW_CXGB, "%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d", + __FUNCTION__, *pbl_addr, rdev_p->rnic_info.pbl_base, + *pbl_size); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + (*pbl_addr >> 5), + (*pbl_size << 3), pbl, 0); + if (err) + goto ret; + } + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_stag_pdid = htobe32(F_TPT_VALID | + V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) | + V_TPT_STAG_STATE(stag_state) | + V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid)); + PANIC_IF(page_size >= 28); + tpt.flags_pagesize_qpid = htobe32(V_TPT_PERM(perm) | + F_TPT_MW_BIND_ENABLE | + V_TPT_ADDR_TYPE((zbva ? 
TPT_ZBTO : TPT_VATO)) | + V_TPT_PAGE_SIZE(page_size)); + tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : + htobe32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + tpt.len = htobe32(len); + tpt.va_hi = htobe32((u32) (to >> 32)); + tpt.va_low_or_fbo = htobe32((u32) (to & 0xFFFFFFFFULL)); + tpt.rsvd_bind_cnt_or_pstag = 0; + tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : + htobe32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + } + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + stag_idx + + (rdev_p->rnic_info.tpt_base >> 5), + sizeof(tpt), &tpt, 1); + + /* release the stag index to free pool */ + if (reset_tpt_entry) + cxio_hal_put_stag(rdev_p->rscp, stag_idx); +ret: + wptr = rdev_p->ctrl_qp.wptr; + mtx_unlock(&rdev_p->ctrl_qp.lock); + if (!err) + if (cxio_wait(&rdev_p->ctrl_qp, + &rdev_p->ctrl_qp.lock, + SEQ32_GE(rdev_p->ctrl_qp.rptr, wptr))) + return (-ERESTART); + return err; +} + +int +cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int +cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int +cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + &pbl_size, &pbl_addr); +} + +int +cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) +{ + u32 pbl_size = 0; + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, + NULL, &pbl_size, NULL); +} + +int +cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + NULL, NULL); +} + +int +cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +{ + struct t3_rdma_init_wr *wqe; + struct mbuf *m = m_gethdr(MT_DATA, M_NOWAIT); + if (m == NULL) + return (-ENOMEM); + CTR2(KTR_IW_CXGB, "%s rdev_p %p", __FUNCTION__, rdev_p); + wqe = mtod(m, struct t3_rdma_init_wr *); + m->m_len = m->m_pkthdr.len = sizeof(*wqe); + wqe->wrh.op_seop_flags = htobe32(V_FW_RIWR_OP(T3_WR_INIT)); + wqe->wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(attr->tid) | + V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); + wqe->wrid.id1 = 0; + wqe->qpid = htobe32(attr->qpid); + wqe->pdid = htobe32(attr->pdid); + wqe->scqid = htobe32(attr->scqid); + wqe->rcqid = htobe32(attr->rcqid); + wqe->rq_addr = htobe32(attr->rq_addr - rdev_p->rnic_info.rqt_base); + wqe->rq_size = htobe32(attr->rq_size); + wqe->mpaattrs = attr->mpaattrs; + wqe->qpcaps = attr->qpcaps; + wqe->ulpdu_size = htobe16(attr->tcp_emss); + wqe->flags = htobe32(attr->flags); + wqe->ord = htobe32(attr->ord); + wqe->ird = htobe32(attr->ird); + wqe->qp_dma_addr = htobe64(attr->qp_dma_addr); + wqe->qp_dma_size = htobe32(attr->qp_dma_size); + wqe->irs = htobe32(attr->irs); + m_set_priority(m, 0); /* 0=>ToeQ; 1=>CtrlQ */ + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return (cxgb_ofld_send(rdev_p->t3cdev_p, m)); +} + +void +cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = ev_cb; +} + +void +cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t 
ev_cb) +{ + cxio_ev_cb = NULL; +} + +static int +cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct mbuf *m) +{ + static int cnt; + struct cxio_rdev *rdev_p = NULL; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) m->m_data; + + CTR6(KTR_IW_CXGB, "%s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x", + __FUNCTION__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), + RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg)); + CTR4(KTR_IW_CXGB, "se %0x notify %0x cqbranch %0x creditth %0x", + RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), + RSPQ_CREDIT_THRESH(rsp_msg)); + CTR4(KTR_IW_CXGB, "CQE: QPID 0x%0x type 0x%0x status 0x%0x opcode %d", + CQE_QPID(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe)); + CTR3(KTR_IW_CXGB, "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x", + CQE_LEN(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; + if (!rdev_p) { + CTR2(KTR_IW_CXGB, "%s called by t3cdev %p with null ulp", __FUNCTION__, + t3cdev_p); + return 0; + } + if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + mtx_lock(&rdev_p->ctrl_qp.lock); + rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; + wakeup(&rdev_p->ctrl_qp); + mtx_unlock(&rdev_p->ctrl_qp.lock); + m_free(m); + } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) + m_free(m); + else if (cxio_ev_cb) + (*cxio_ev_cb) (rdev_p, m); + else + m_free(m); + cnt++; + return 0; +} + +/* Caller takes care of locking if needed */ +int +cxio_rdev_open(struct cxio_rdev *rdev_p) +{ + struct ifnet *ifp; + int err = 0; + + if (strlen(rdev_p->dev_name)) { + if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { + return (-EBUSY); + } + ifp = rdev_p->ifp; + if (ifp == NULL) + return (-EINVAL); + if_free(ifp); + } else if (rdev_p->t3cdev_p) { + if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) + return (-EBUSY); + ifp = rdev_p->t3cdev_p->lldev; + strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, + T3_MAX_DEV_NAME_LEN); + } else { + CTR1(KTR_IW_CXGB, "%s t3cdev_p or dev_name must be set", __FUNCTION__); + return (-EINVAL); + } + + TAILQ_INSERT_TAIL(&rdev_list, rdev_p, entry); + + CTR2(KTR_IW_CXGB, "%s opening rnic dev %s", __FUNCTION__, rdev_p->dev_name); + memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); + if (!rdev_p->t3cdev_p) + rdev_p->t3cdev_p = T3CDEV(ifp); + rdev_p->t3cdev_p->ulp = (void *) rdev_p; + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, + &(rdev_p->rnic_info)); + if (err) { + log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, + &(rdev_p->port_info)); + if (err) { + log(LOG_ERR, "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. 
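+	 * Illustrative arithmetic (assuming 4KB pages and a 64MB user
+	 * doorbell region): udbell_len >> PAGE_SHIFT = 16384, so
+	 * 65536 >> ilog2(16384) = 4 and qpshift = 12 - ilog2(4) = 10;
+	 * a user QP doorbell then lives at udbell_physbase +
+	 * (qpid << 10), four doorbell windows per page.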
+ */ + cxio_init_ucontext(rdev_p, &rdev_p->uctx); + rdev_p->qpshift = PAGE_SHIFT - + ilog2(65536 >> + ilog2(rdev_p->rnic_info.udbell_len >> + PAGE_SHIFT)); + rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; + rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; + CTR4(KTR_IW_CXGB, "cxio_rdev_open rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d", + rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p)); + CTR4(KTR_IW_CXGB, "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x", + rdev_p->rnic_info.pbl_base, + rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base, + rdev_p->rnic_info.rqt_top); + CTR6(KTR_IW_CXGB, "udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu " + "qpnr %d qpmask 0x%x", + rdev_p->rnic_info.udbell_len, + rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr, + rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask); + + err = cxio_hal_init_ctrl_qp(rdev_p); + if (err) { + log(LOG_ERR, "%s error %d initializing ctrl_qp.\n", + __FUNCTION__, err); + goto err1; + } + err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0, + 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ, + T3_MAX_NUM_PD); + if (err) { + log(LOG_ERR, "%s error %d initializing hal resources.\n", + __FUNCTION__, err); + goto err2; + } + err = cxio_hal_pblpool_create(rdev_p); + if (err) { + log(LOG_ERR, "%s error %d initializing pbl mem pool.\n", + __FUNCTION__, err); + goto err3; + } + err = cxio_hal_rqtpool_create(rdev_p); + if (err) { + log(LOG_ERR, "%s error %d initializing rqt mem pool.\n", + __FUNCTION__, err); + goto err4; + } + return 0; +err4: + cxio_hal_pblpool_destroy(rdev_p); +err3: + cxio_hal_destroy_resource(rdev_p->rscp); +err2: + cxio_hal_destroy_ctrl_qp(rdev_p); +err1: + TAILQ_REMOVE(&rdev_list, rdev_p, entry); + return err; +} + +void +cxio_rdev_close(struct cxio_rdev *rdev_p) +{ + if (rdev_p) { + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + TAILQ_REMOVE(&rdev_list, rdev_p, entry); + rdev_p->t3cdev_p->ulp = NULL; + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); + } +} + +int +cxio_hal_init(void) +{ + TAILQ_INIT(&rdev_list); +#ifdef needed + if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) + return (-ENOMEM); +#endif + t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + return 0; +} + +void +cxio_hal_exit(void) +{ + struct cxio_rdev *rdev, *tmp; + + t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); + TAILQ_FOREACH_SAFE(rdev, &rdev_list, entry, tmp) + cxio_rdev_close(rdev); +#ifdef needed + cxio_hal_destroy_rhdl_resource(); +#endif +} + +static void +flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_swsq *sqp; + __u32 ptr = wq->sq_rptr; + int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr); + + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (count--) + if (!sqp->signaled) { + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + } else if (sqp->complete) { + + /* + * Insert this completed cqe into the swcq. 
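+ * The CQE was stashed here by cxio_poll_cq() when it completed out of
+ * order; once the preceding unsignaled entries have been walked it is
+ * now in-order and can be moved into the software CQ for the consumer.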
+ */ + CTR3(KTR_IW_CXGB, "%s moving cqe into swcq sq idx %ld cq idx %ld", + __FUNCTION__, Q_PTR2IDX(ptr, wq->sq_size_log2), + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)); + sqp->cqe.header |= htonl(V_CQE_SWCQE(1)); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) + = sqp->cqe; + cq->sw_wptr++; + sqp->signaled = 0; + break; + } else + break; +} + +static void +create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe, + struct t3_cqe *read_cqe) +{ + read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr; + read_cqe->len = wq->oldest_read->read_len; + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) | + V_CQE_SWCQE(SW_CQE(*hw_cqe)) | + V_CQE_OPCODE(T3_READ_REQ) | + V_CQE_TYPE(1)); +} + +/* + * Return a ptr to the next read wr in the SWSQ or NULL. + */ +static void +advance_oldest_read(struct t3_wq *wq) +{ + + u32 rptr = wq->oldest_read - wq->sq + 1; + u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2); + + while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) { + wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2); + + if (wq->oldest_read->opcode == T3_READ_REQ) + return; + rptr++; + } + wq->oldest_read = NULL; +} + +/* + * cxio_poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned, + * -1 CQE skipped, try again. + */ +int +cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t3_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + hw_cqe = cxio_next_cqe(cq); + + CTR5(KTR_IW_CXGB, "cxio_poll_cq CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x", + CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe), + CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe)); + CTR4(KTR_IW_CXGB, "opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x", + CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe), + CQE_WRID_LOW(*hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -1; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + /* + * T3A: Discard TERMINATE CQEs. + */ + if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) { + ret = -1; + wq->error = 1; + goto skip_cqe; + } + + if (CQE_STATUS(*hw_cqe) || wq->error) { + *cqe_flushed = wq->error; + wq->error = 1; + + /* + * T3A inserts errors into the CQE. We cannot return + * these as work completions. + */ + /* incoming write failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE) + && RQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + /* incoming read request failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + + /* incoming SEND with no receive posted failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + ret = -1; + goto skip_cqe; + } + goto proc_cqe; + } + + /* + * RECV completion. 
+ */ + if (RQ_TYPE(*hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with TPT_ERR_MSN and mark the wq in + * error. + */ + if (__predict_false((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { + wq->error = 1; + hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) { + struct t3_swsq *sqp; + + CTR2(KTR_IW_CXGB, "%s out of order completion going in swsq at idx %ld", + __FUNCTION__, + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2)); + sqp = wq->sq + + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2); + sqp->cqe = *hw_cqe; + sqp->complete = 1; + ret = -1; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(*hw_cqe)) { + wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); + CTR2(KTR_IW_CXGB, "%s completing sq idx %ld", __FUNCTION__, + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); + *cookie = (wq->sq + + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id; + wq->sq_rptr++; + } else { + CTR2(KTR_IW_CXGB, "%s completing rq idx %ld", __FUNCTION__, + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + wq->rq_rptr++; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(*hw_cqe)) { + CTR4(KTR_IW_CXGB, "%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x", + __FUNCTION__, cq, cq->cqid, cq->sw_rptr); + ++cq->sw_rptr; + } else { + CTR4(KTR_IW_CXGB, "%s cq %p cqid 0x%x skip hw cqe rptr 0x%x", + __FUNCTION__, cq, cq->cqid, cq->rptr); + ++cq->rptr; + + /* + * T3A: compute credits. + */ + if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1))) + || ((cq->rptr - cq->wptr) >= 128)) { + *credit = cq->rptr - cq->wptr; + cq->wptr = cq->rptr; + } + } + return ret; +} + + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h new file mode 100644 index 0000000000000..6a401e09322d7 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h @@ -0,0 +1,330 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef __CXIO_HAL_H__ +#define __CXIO_HAL_H__ +#include <sys/condvar.h> +#include <sys/ktr.h> + +#define T3_CTRL_QP_ID FW_RI_SGEEC_START +#define T3_CTL_QP_TID FW_RI_TID_START +#define T3_CTRL_QP_SIZE_LOG2 8 +#define T3_CTRL_CQ_ID 0 + +/* TBD */ +#define T3_MAX_NUM_RI (1<<15) +#define T3_MAX_NUM_QP (1<<15) +#define T3_MAX_NUM_CQ (1<<15) +#define T3_MAX_NUM_PD (1<<15) +#define T3_MAX_PBL_SIZE 256 +#define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_NUM_STAG (1<<15) + +#define T3_STAG_UNSET 0xffffffff + +#define T3_MAX_DEV_NAME_LEN 32 + +struct cxio_hal_ctrl_qp { + u32 wptr; + u32 rptr; + struct mtx lock; /* for the wtpr, can sleep */ +#ifdef notyet + DECLARE_PCI_UNMAP_ADDR(mapping) +#endif + union t3_wr *workq; /* the work request queue */ + bus_addr_t dma_addr; /* pci bus address of the workq */ + void /* __iomem */ *doorbell; +}; + +struct cxio_hal_resource { + struct buf_ring *tpt_fifo; + struct mtx tpt_fifo_lock; + struct buf_ring *qpid_fifo; + struct mtx qpid_fifo_lock; + struct buf_ring *cqid_fifo; + struct mtx cqid_fifo_lock; + struct buf_ring *pdid_fifo; + struct mtx pdid_fifo_lock; +}; + +struct cxio_qpid { + TAILQ_ENTRY(cxio_qpid) entry; + u32 qpid; +}; + +struct cxio_ucontext { + TAILQ_HEAD(, cxio_qpid) qpids; + struct mtx lock; +}; + +struct cxio_rdev { + char dev_name[T3_MAX_DEV_NAME_LEN]; + struct t3cdev *t3cdev_p; + struct rdma_info rnic_info; + struct adap_ports port_info; + struct cxio_hal_resource *rscp; + struct cxio_hal_ctrl_qp ctrl_qp; + void *ulp; + unsigned long qpshift; + u32 qpnr; + u32 qpmask; + struct cxio_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct ifnet *ifp; + TAILQ_ENTRY(cxio_rdev) entry; +}; + +static __inline int +cxio_num_stags(struct cxio_rdev *rdev_p) +{ + return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); +} + +typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p, + struct mbuf * m); + +#define RSPQ_CQID(rsp) (be32toh(rsp->cq_ptrid) & 0xffff) +#define RSPQ_CQPTR(rsp) ((be32toh(rsp->cq_ptrid) >> 16) & 0xffff) +#define RSPQ_GENBIT(rsp) ((be32toh(rsp->flags) >> 16) & 1) +#define RSPQ_OVERFLOW(rsp) ((be32toh(rsp->flags) >> 17) & 1) +#define RSPQ_AN(rsp) ((be32toh(rsp->flags) >> 18) & 1) +#define RSPQ_SE(rsp) ((be32toh(rsp->flags) >> 19) & 1) +#define RSPQ_NOTIFY(rsp) ((be32toh(rsp->flags) >> 20) & 1) +#define RSPQ_CQBRANCH(rsp) ((be32toh(rsp->flags) >> 21) & 1) +#define RSPQ_CREDIT_THRESH(rsp) ((be32toh(rsp->flags) >> 22) & 1) + +struct respQ_msg_t { + __be32 flags; /* flit 0 */ + __be32 cq_ptrid; + __be64 rsvd; /* flit 1 */ + struct t3_cqe cqe; /* flits 2-3 */ +}; + +enum t3_cq_opcode { + CQ_ARM_AN = 0x2, + CQ_ARM_SE = 0x6, + CQ_FORCE_AN = 0x3, + CQ_CREDIT_UPDATE = 0x7 +}; + +int cxio_rdev_open(struct cxio_rdev *rdev); +void cxio_rdev_close(struct cxio_rdev *rdev); +int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, + enum t3_cq_opcode op, u32 
credit); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr); +int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); +int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); +int cxio_hal_init(void); +void cxio_hal_exit(void); +void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_flush_hw_cq(struct t3_cq *cq); +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit); + +#define MOD "iw_cxgb: " + +#ifdef DEBUG +void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag); +void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint32_t len, u8 shift); +void cxio_dump_wqe(union t3_wr *wqe); +void cxio_dump_wce(struct t3_cqe *wce); +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents); +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid); +#endif + + + static unsigned char hiBitSetTab[] = { + 0, 1, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7 + +}; + + +static __inline +int ilog2(unsigned long val) +{ + unsigned long tmp; + + tmp = val >> 24; + if (tmp) { + return hiBitSetTab[tmp] + 23; + } + tmp = (val >> 16) & 0xff; + if (tmp) { + return hiBitSetTab[tmp] + 15; + } + tmp = (val >> 8) & 0xff; + if (tmp) { + return hiBitSetTab[tmp] + 7; + + } + return hiBitSetTab[val & 0xff] - 1; +} + +#define cxfree(a) free((a), M_DEVBUF); +#define kmalloc(a, b) malloc((a), M_DEVBUF, (b)) +#define kzalloc(a, b) malloc((a), M_DEVBUF, (b)|M_ZERO) + +static __inline __attribute__((const)) +unsigned long roundup_pow_of_two(unsigned long n) +{ + return 1UL << flsl(n - 1); +} + +#define PAGE_ALIGN(x) 
roundup2((x), PAGE_SIZE) + +#include <sys/blist.h> +struct gen_pool { + blist_t gen_list; + daddr_t gen_base; + int gen_chunk_shift; + struct mtx gen_lock; +}; + +static __inline struct gen_pool * +gen_pool_create(daddr_t base, u_int chunk_shift, u_int len) +{ + struct gen_pool *gp; + + gp = malloc(sizeof(struct gen_pool), M_DEVBUF, M_NOWAIT); + if (gp == NULL) + return (NULL); + + gp->gen_list = blist_create(len >> chunk_shift, M_NOWAIT); + if (gp->gen_list == NULL) { + free(gp, M_DEVBUF); + return (NULL); + } + blist_free(gp->gen_list, 0, len >> chunk_shift); + gp->gen_base = base; + gp->gen_chunk_shift = chunk_shift; + mtx_init(&gp->gen_lock, "genpool", NULL, MTX_DUPOK|MTX_DEF); + + return (gp); +} + +static __inline unsigned long +gen_pool_alloc(struct gen_pool *gp, int size) +{ + int chunks; + daddr_t blkno; + + chunks = (size + (1<<gp->gen_chunk_shift) - 1) >> gp->gen_chunk_shift; + mtx_lock(&gp->gen_lock); + blkno = blist_alloc(gp->gen_list, chunks); + mtx_unlock(&gp->gen_lock); + + if (blkno == SWAPBLK_NONE) + return (0); + + return (gp->gen_base + ((1 << gp->gen_chunk_shift) * blkno)); +} + +static __inline void +gen_pool_free(struct gen_pool *gp, daddr_t address, int size) +{ + int chunks; + daddr_t blkno; + + chunks = (size + (1<<gp->gen_chunk_shift) - 1) >> gp->gen_chunk_shift; + blkno = (address - gp->gen_base) / (1 << gp->gen_chunk_shift); + mtx_lock(&gp->gen_lock); + blist_free(gp->gen_list, blkno, chunks); + mtx_unlock(&gp->gen_lock); +} + +static __inline void +gen_pool_destroy(struct gen_pool *gp) +{ + blist_destroy(gp->gen_list); + free(gp, M_DEVBUF); +} + +#define cxio_wait(ctx, lockp, cond) \ +({ \ + int __ret = 0; \ + mtx_lock(lockp); \ + while (!cond) { \ + msleep(ctx, lockp, 0, "cxio_wait", hz); \ + if (SIGPENDING(curthread)) { \ + __ret = ERESTART; \ + break; \ + } \ + } \ + mtx_unlock(lockp); \ + __ret; \ +}) +extern struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev); + +#define KTR_IW_CXGB KTR_SPARE4 + +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c new file mode 100644 index 0000000000000..df06f87a9805c --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_mem.c @@ -0,0 +1,219 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list) +{ + u32 stag; + u32 mmid; + + + if (cxio_register_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return (-ENOMEM); + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp); + return 0; +} + +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages) +{ + u32 stag; + u32 mmid; + + + /* We could support this... 
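+ * As written, a reregistration that needs more pages than the PBL
+ * allocated at the original registration is simply rejected.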
*/ + if (npages > mhp->attr.pbl_size) + return (-ENOMEM); + + stag = mhp->attr.stag; + if (cxio_reregister_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return (-ENOMEM); + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + CTR3(KTR_IW_CXGB, "%s mmid 0x%x mhp %p", __FUNCTION__, mmid, mhp); + return 0; +} + +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return (-EINVAL); + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return (-EINVAL); + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + else + mask |= buffer_list[i].addr & PAGE_MASK; + if (i != num_phys_buf - 1) + mask |= buffer_list[i].addr + buffer_list[i].size; + else + mask |= (buffer_list[i].addr + buffer_list[i].size + + PAGE_SIZE - 1) & PAGE_MASK; + } + + if (*total_size > 0xFFFFFFFFULL) + return (-ENOMEM); + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if ((1ULL << *shift) & mask) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return (-EINVAL); + + *page_list = kmalloc(sizeof(u64) * *npages, M_NOWAIT); + if (!*page_list) + return (-ENOMEM); + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = htobe64(buffer_list[i].addr + + ((u64) j << *shift)); + + CTR6(KTR_IW_CXGB, "%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d", + __FUNCTION__, (unsigned long long) *iova_start, + (unsigned long long) mask, *shift, (unsigned long long) *total_size, + *npages); + + return 0; + +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c new file mode 100644 index 0000000000000..4ef7dc5f8c979 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.c @@ -0,0 +1,1295 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> + +#include <netinet/in.h> + + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + + +static int +iwch_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + return (-ENOSYS); +} + +static struct ib_ah * +iwch_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int +iwch_ah_destroy(struct ib_ah *ah) +{ + return (-ENOSYS); +} + +static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return (-ENOSYS); +} + +static int +iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return (-ENOSYS); +} + +static int +iwch_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + return (-ENOSYS); +} + +static int +iwch_dealloc_ucontext(struct ib_ucontext *context) +{ + struct iwch_dev *rhp = to_iwch_dev(context->device); + struct iwch_ucontext *ucontext = to_iwch_ucontext(context); + struct iwch_mm_entry *mm, *tmp; + + CTR2(KTR_IW_CXGB, "%s context %p", __FUNCTION__, context); + TAILQ_FOREACH_SAFE(mm, &ucontext->mmaps, entry, tmp) { + TAILQ_REMOVE(&ucontext->mmaps, mm, entry); + cxfree(mm); + } + cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); + cxfree(ucontext); + return 0; +} + +static struct ib_ucontext * +iwch_alloc_ucontext(struct ib_device *ibdev, struct 
ib_udata *udata) +{ + struct iwch_ucontext *context; + struct iwch_dev *rhp = to_iwch_dev(ibdev); + + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + context = malloc(sizeof(*context), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!context) + return ERR_PTR(-ENOMEM); + cxio_init_ucontext(&rhp->rdev, &context->uctx); + TAILQ_INIT(&context->mmaps); + mtx_init(&context->mmap_lock, "ucontext mmap", NULL, MTX_DEF); + return &context->ibucontext; +} + +static int +iwch_destroy_cq(struct ib_cq *ib_cq) +{ + struct iwch_cq *chp; + + CTR2(KTR_IW_CXGB, "%s ib_cq %p", __FUNCTION__, ib_cq); + chp = to_iwch_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + mtx_lock(&chp->lock); + if (--chp->refcnt) + msleep(chp, &chp->lock, 0, "iwch_destroy_cq", 0); + mtx_unlock(&chp->lock); + + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + cxfree(chp); + return 0; +} + +static struct ib_cq * +iwch_create_cq(struct ib_device *ibdev, int entries, int vector, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + struct iwch_create_cq_resp uresp; + struct iwch_create_cq_req ureq; + struct iwch_ucontext *ucontext = NULL; + + CTR3(KTR_IW_CXGB, "%s ib_dev %p entries %d", __FUNCTION__, ibdev, entries); + rhp = to_iwch_dev(ibdev); + chp = malloc(sizeof(*chp), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!chp) { + return ERR_PTR(-ENOMEM); + } + if (ib_context) { + ucontext = to_iwch_ucontext(ib_context); + if (!t3a_device(rhp)) { + if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + cxfree(chp); + return ERR_PTR(-EFAULT); + } + chp->user_rptr_addr = (u32 /*__user */*)(unsigned long)ureq.user_rptr_addr; + } + } + + if (t3a_device(rhp)) { + + /* + * T3A: Add some fluff to handle extra CQEs inserted + * for various errors. + * Additional CQE possibilities: + * TERMINATE, + * incoming RDMA WRITE Failures + * incoming RDMA READ REQUEST FAILUREs + * NOTE: We cannot ensure the CQ won't overflow. + */ + entries += 16; + } + entries = roundup_pow_of_two(entries); + chp->cq.size_log2 = ilog2(entries); + + if (cxio_create_cq(&rhp->rdev, &chp->cq)) { + cxfree(chp); + return ERR_PTR(-ENOMEM); + } + chp->rhp = rhp; + chp->ibcq.cqe = 1 << chp->cq.size_log2; + mtx_init(&chp->lock, "cxgb cq", NULL, MTX_DEF|MTX_DUPOK); + chp->refcnt = 1; + insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + + if (ucontext) { + struct iwch_mm_entry *mm; + + mm = kmalloc(sizeof *mm, M_NOWAIT); + if (!mm) { + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-ENOMEM); + } + uresp.cqid = chp->cq.cqid; + uresp.size_log2 = chp->cq.size_log2; + mtx_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + mtx_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + cxfree(mm); + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-EFAULT); + } + mm->key = uresp.key; + mm->addr = vtophys(chp->cq.queue); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof (struct t3_cqe)); + insert_mmap(ucontext, mm); + } + CTR4(KTR_IW_CXGB, "created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx", + chp->cq.cqid, chp, (1 << chp->cq.size_log2), + (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +} + +static int +iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ +#ifdef notyet + struct iwch_cq *chp = to_iwch_cq(cq); + struct t3_cq oldcq, newcq; + int ret; + + CTR3(KTR_IW_CXGB, "%s ib_cq %p cqe %d", __FUNCTION__, cq, cqe); + + /* We don't downsize... 
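+ * As written, a request for the same number of CQEs or fewer is
+ * treated as a no-op and reported as success.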
*/ + if (cqe <= cq->cqe) + return 0; + + /* create new t3_cq with new size */ + cqe = roundup_pow_of_two(cqe+1); + newcq.size_log2 = ilog2(cqe); + + /* Dont allow resize to less than the current wce count */ + if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) { + return (-ENOMEM); + } + + /* Quiesce all QPs using this CQ */ + ret = iwch_quiesce_qps(chp); + if (ret) { + return (ret); + } + + ret = cxio_create_cq(&chp->rhp->rdev, &newcq); + if (ret) { + return (ret); + } + + /* copy CQEs */ + memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) * + sizeof(struct t3_cqe)); + + /* old iwch_qp gets new t3_cq but keeps old cqid */ + oldcq = chp->cq; + chp->cq = newcq; + chp->cq.cqid = oldcq.cqid; + + /* resize new t3_cq to update the HW context */ + ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq); + if (ret) { + chp->cq = oldcq; + return ret; + } + chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1; + + /* destroy old t3_cq */ + oldcq.cqid = newcq.cqid; + ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq); + if (ret) { + log(LOG_ERR, "%s - cxio_destroy_cq failed %d\n", + __FUNCTION__, ret); + } + + /* add user hooks here */ + + /* resume qps */ + ret = iwch_resume_qps(chp); + return ret; +#else + return (-ENOSYS); +#endif +} + +static int +iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + enum t3_cq_opcode cq_op; + int err; + u32 rptr; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + cq_op = CQ_ARM_SE; + else + cq_op = CQ_ARM_AN; + if (chp->user_rptr_addr) { + if (copyin(&rptr, chp->user_rptr_addr, 4)) + return (-EFAULT); + mtx_lock(&chp->lock); + chp->cq.rptr = rptr; + } else + mtx_lock(&chp->lock); + CTR2(KTR_IW_CXGB, "%s rptr 0x%x", __FUNCTION__, chp->cq.rptr); + err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0); + mtx_unlock(&chp->lock); + if (err < 0) + log(LOG_ERR, "Error %d rearming CQID 0x%x\n", err, + chp->cq.cqid); + if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) + err = 0; + return err; +} + +#ifdef notyet +static int +iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ +#ifdef notyet + int len = vma->vm_end - vma->vm_start; + u32 key = vma->vm_pgoff << PAGE_SHIFT; + struct cxio_rdev *rdev_p; + int ret = 0; + struct iwch_mm_entry *mm; + struct iwch_ucontext *ucontext; + u64 addr; + + CTR4(KTR_IW_CXGB, "%s pgoff 0x%lx key 0x%x len %d", __FUNCTION__, vma->vm_pgoff, + key, len); + + if (vma->vm_start & (PAGE_SIZE-1)) { + return (-EINVAL); + } + + rdev_p = &(to_iwch_dev(context->device)->rdev); + ucontext = to_iwch_ucontext(context); + + mm = remove_mmap(ucontext, key, len); + if (!mm) + return (-EINVAL); + addr = mm->addr; + cxfree(mm); + + if ((addr >= rdev_p->rnic_info.udbell_physbase) && + (addr < (rdev_p->rnic_info.udbell_physbase + + rdev_p->rnic_info.udbell_len))) { + + /* + * Map T3 DB register. + */ + if (vma->vm_flags & VM_READ) { + return (-EPERM); + } + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_flags &= ~VM_MAYREAD; + ret = io_remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } else { + + /* + * Map WQ or CQ contig dma memory... 
+ */ + ret = remap_pfn_range(vma, vma->vm_start, + addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +#endif + return (0); +} +#endif + +static int iwch_deallocate_pd(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + + php = to_iwch_pd(pd); + rhp = php->rhp; + CTR3(KTR_IW_CXGB, "%s ibpd %p pdid 0x%x", __FUNCTION__, pd, php->pdid); + cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); + cxfree(php); + return 0; +} + +static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct iwch_pd *php; + u32 pdid; + struct iwch_dev *rhp; + + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + rhp = (struct iwch_dev *) ibdev; + pdid = cxio_hal_get_pdid(rhp->rdev.rscp); + if (!pdid) + return ERR_PTR(-EINVAL); + php = malloc(sizeof(*php), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!php) { + cxio_hal_put_pdid(rhp->rdev.rscp, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { + iwch_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + CTR3(KTR_IW_CXGB, "%s pdid 0x%0x ptr 0x%p", __FUNCTION__, pdid, php); + return &php->ibpd; +} + +static int iwch_dereg_mr(struct ib_mr *ib_mr) +{ + struct iwch_dev *rhp; + struct iwch_mr *mhp; + u32 mmid; + + CTR2(KTR_IW_CXGB, "%s ib_mr %p", __FUNCTION__, ib_mr); + /* There can be no memory windows */ + if (atomic_load_acq_int(&ib_mr->usecnt)) + return (-EINVAL); + + mhp = to_iwch_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->kva) + cxfree((void *) (unsigned long) mhp->kva); + if (mhp->umem) + ib_umem_release(mhp->umem); + CTR3(KTR_IW_CXGB, "%s mmid 0x%x ptr %p", __FUNCTION__, mmid, mhp); + cxfree(mhp); + return 0; +} + +static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + int ret; + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + php = to_iwch_pd(pd); + rhp = php->rhp; + + mhp = malloc(sizeof(*mhp), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!mhp) + return ERR_PTR(-ENOMEM); + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, &page_list); + if (ret) + goto err; + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = iwch_register_mem(rhp, php, mhp, shift, page_list); + cxfree(page_list); + if (ret) { + goto err; + } + return &mhp->ibmr; +err: + cxfree(mhp); + return ERR_PTR(-ret); + +} + +static int iwch_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 * iova_start) +{ + + struct iwch_mr mh, *mhp; + struct iwch_pd *php; + struct iwch_dev *rhp; + __be64 
*page_list = NULL; + int shift = 0; + u64 total_size; + int npages; + int ret; + + CTR3(KTR_IW_CXGB, "%s ib_mr %p ib_pd %p", __FUNCTION__, mr, pd); + + /* There can be no memory windows */ + if (atomic_load_acq_int(&mr->usecnt)) + return (-EINVAL); + + mhp = to_iwch_mr(mr); + rhp = mhp->rhp; + php = to_iwch_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return (-EINVAL); + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_iwch_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mh.attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + if (ret) + return ret; + } + + ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + cxfree(page_list); + if (ret) { + return ret; + } + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + + +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, i, n; + int err = 0; + struct ib_umem_chunk *chunk; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + struct iwch_reg_user_mr_resp uresp; +#ifdef notyet + int j, k, len; +#endif + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = malloc(sizeof(*mhp), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!mhp) + return ERR_PTR(-ENOMEM); + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc); + if (IS_ERR(mhp->umem)) { + err = PTR_ERR(mhp->umem); + cxfree(mhp); + return ERR_PTR(-err); + } + + shift = ffs(mhp->umem->page_size) - 1; + + n = 0; + TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) + n += chunk->nents; + + pages = kmalloc(n * sizeof(u64), M_NOWAIT); + if (!pages) { + err = -ENOMEM; + goto err; + } + + i = n = 0; + +#if 0 + TAILQ_FOREACH(chunk, &mhp->umem->chunk_list, entry) + for (j = 0; j < chunk->nmap; ++j) { + len = sg_dma_len(&chunk->page_list[j]) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = htobe64(sg_dma_address( + &chunk->page_list[j]) + + mhp->umem->page_size * k); + } + } +#endif + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = iwch_ib_to_tpt_access(acc); + mhp->attr.va_fbo = virt; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) length; + mhp->attr.pbl_size = i; + err = iwch_register_mem(rhp, php, mhp, shift, pages); + cxfree(pages); + if (err) + goto err; + + if (udata && !t3a_device(rhp)) { + uresp.pbl_addr = (mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3; + CTR2(KTR_IW_CXGB, "%s user resp pbl_addr 0x%x", __FUNCTION__, + uresp.pbl_addr); + + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + iwch_dereg_mr(&mhp->ibmr); + err = EFAULT; + goto err; + } + } + + return &mhp->ibmr; + +err: + ib_umem_release(mhp->umem); + cxfree(mhp); + return ERR_PTR(-err); +} + +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva; + struct ib_mr *ibmr; + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + + /* + * T3 only supports 32 bits of size. 
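+ * The DMA MR is therefore set up as a single physical registration
+ * covering addresses 0 through 0xffffffff, the largest region a TPT
+ * entry can describe.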
+ */ + bl.size = 0xffffffff; + bl.addr = 0; + kva = 0; + ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); + return ibmr; +} + +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = malloc(sizeof(*mhp), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + cxfree(mhp); + return ERR_PTR(-ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + CTR4(KTR_IW_CXGB, "%s mmid 0x%x mhp %p stag 0x%x", __FUNCTION__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +static int iwch_dealloc_mw(struct ib_mw *mw) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + u32 mmid; + + mhp = to_iwch_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + remove_handle(rhp, &rhp->mmidr, mmid); + cxfree(mhp); + CTR4(KTR_IW_CXGB, "%s ib_mw %p mmid 0x%x ptr %p", __FUNCTION__, mw, mmid, mhp); + return 0; +} + +static int iwch_destroy_qp(struct ib_qp *ib_qp) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_qp_attributes attrs; + struct iwch_ucontext *ucontext; + + qhp = to_iwch_qp(ib_qp); + rhp = qhp->rhp; + + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); + mtx_lock(&qhp->lock); + if (qhp->ep) + msleep(qhp, &qhp->lock, 0, "iwch_destroy_qp1", 0); + mtx_unlock(&qhp->lock); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); + + mtx_lock(&qhp->lock); + if (--qhp->refcnt) + msleep(qhp, &qhp->lock, 0, "iwch_destroy_qp2", 0); + mtx_unlock(&qhp->lock); + + ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context) + : NULL; + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + + CTR4(KTR_IW_CXGB, "%s ib_qp %p qpid 0x%0x qhp %p", __FUNCTION__, + ib_qp, qhp->wq.qpid, qhp); + cxfree(qhp); + return 0; +} + +static struct ib_qp *iwch_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_pd *php; + struct iwch_cq *schp; + struct iwch_cq *rchp; + struct iwch_create_qp_resp uresp; + int wqsize, sqsize, rqsize; + struct iwch_ucontext *ucontext; + + CTR2(KTR_IW_CXGB, "%s ib_pd %p", __FUNCTION__, pd); + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + /* The RQT size must be # of entries + 1 rounded up to a power of two */ + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); + if (rqsize == attrs->cap.max_recv_wr) + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); + + /* T3 doesn't support RQT depth < 16 */ + if (rqsize < 16) + rqsize = 16; + + if (rqsize > T3_MAX_RQ_SIZE) + return ERR_PTR(-EINVAL); + + if (attrs->cap.max_inline_data > T3_MAX_INLINE) + return ERR_PTR(-EINVAL); + + /* + * NOTE: The SQ and total WQ sizes don't need to be + * a power of two. However, all the code assumes + * they are. EG: Q_FREECNT() and friends. 
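+ * Worked example (caller-supplied values, shown only for
+ * illustration): max_send_wr = 100 and max_recv_wr = 100 give
+ * sqsize = 128, rqsize = 128 and wqsize = 256, and the capabilities
+ * returned to the caller become max_send_wr = 128, max_recv_wr = 127.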
+ */ + sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); + wqsize = roundup_pow_of_two(rqsize + sqsize); + CTR4(KTR_IW_CXGB, "%s wqsize %d sqsize %d rqsize %d", __FUNCTION__, + wqsize, sqsize, rqsize); + qhp = malloc(sizeof(*qhp), M_DEVBUF, M_ZERO|M_NOWAIT); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.size_log2 = ilog2(wqsize); + qhp->wq.rq_size_log2 = ilog2(rqsize); + qhp->wq.sq_size_log2 = ilog2(sqsize); + ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; + if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { + cxfree(qhp); + return ERR_PTR(-ENOMEM); + } + + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize; + attrs->cap.max_inline_data = T3_MAX_INLINE; + + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.next_state = IWCH_QP_STATE_IDLE; + + /* + * XXX - These don't get passed in from the openib user + * at create time. The CM sets them via a QP modify. + * Need to fix... I think the CM should + */ + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 1; + qhp->attr.max_ird = 1; + + mtx_init(&qhp->lock, "cxgb qp", NULL, MTX_DEF|MTX_DUPOK); + qhp->refcnt = 1; + insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid); + + if (udata) { + + struct iwch_mm_entry *mm1, *mm2; + + mm1 = kmalloc(sizeof *mm1, M_NOWAIT); + if (!mm1) { + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + mm2 = kmalloc(sizeof *mm2, M_NOWAIT); + if (!mm2) { + cxfree(mm1); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + uresp.qpid = qhp->wq.qpid; + uresp.size_log2 = qhp->wq.size_log2; + uresp.sq_size_log2 = qhp->wq.sq_size_log2; + uresp.rq_size_log2 = qhp->wq.rq_size_log2; + mtx_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.db_key = ucontext->key; + ucontext->key += PAGE_SIZE; + mtx_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + cxfree(mm1); + cxfree(mm2); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-EFAULT); + } + mm1->key = uresp.key; + mm1->addr = vtophys(qhp->wq.queue); + mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + insert_mmap(ucontext, mm1); + mm2->key = uresp.db_key; + mm2->addr = qhp->wq.udb & PAGE_MASK; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + qhp->ibqp.qp_num = qhp->wq.qpid; + callout_init(&(qhp->timer), TRUE); + CTR6(KTR_IW_CXGB, "sq_num_entries %d, rq_num_entries %d " + "qpid 0x%0x qhp %p dma_addr 0x%llx size %d", + qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr, + 1 << qhp->wq.size_log2); + return &qhp->ibqp; +} + +static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + enum iwch_qp_attr_mask mask = 0; + struct iwch_qp_attributes attrs; + + CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state 
== IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_iwch_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = iwch_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? + (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +void iwch_qp_add_ref(struct ib_qp *qp) +{ + CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, qp); + mtx_lock(&to_iwch_qp(qp)->lock); + to_iwch_qp(qp)->refcnt++; + mtx_unlock(&to_iwch_qp(qp)->lock); +} + +void iwch_qp_rem_ref(struct ib_qp *qp) +{ + CTR2(KTR_IW_CXGB, "%s ib_qp %p", __FUNCTION__, qp); + mtx_lock(&to_iwch_qp(qp)->lock); + if (--to_iwch_qp(qp)->refcnt == 0) + wakeup(to_iwch_qp(qp)); + mtx_unlock(&to_iwch_qp(qp)->lock); +} + +static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) +{ + CTR3(KTR_IW_CXGB, "%s ib_dev %p qpn 0x%x", __FUNCTION__, dev, qpn); + return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); +} + + +static int iwch_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + *pkey = 0; + return 0; +} + +static int iwch_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct iwch_dev *dev; + struct port_info *pi; + + CTR5(KTR_IW_CXGB, "%s ibdev %p, port %d, index %d, gid %p", + __FUNCTION__, ibdev, port, index, gid); + dev = to_iwch_dev(ibdev); + PANIC_IF(port == 0 || port > 2); + pi = ((struct port_info *)dev->rdev.port_info.lldevs[port-1]->if_softc); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), pi->hw_addr, 6); + return 0; +} + +static int iwch_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct iwch_dev *dev; + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + + dev = to_iwch_dev(ibdev); + memset(props, 0, sizeof *props); +#ifdef notyet + memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->if_addr.ifa_addr, 6); +#endif + props->device_cap_flags = dev->device_cap_flags; +#ifdef notyet + props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; +#endif + props->max_mr_size = ~0ull; + props->max_qp = dev->attr.max_qps; + props->max_qp_wr = dev->attr.max_wrs; + props->max_sge = dev->attr.max_sge_per_wr; + props->max_sge_rd = 1; + props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_cq = dev->attr.max_cqs; + props->max_cqe = dev->attr.max_cqes_per_cq; + props->max_mr = dev->attr.max_mem_regs; + props->max_pd = dev->attr.max_pds; + props->local_ca_ack_delay = 0; + + return 0; +} + +static int iwch_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + CTR2(KTR_IW_CXGB, "%s ibdev %p", __FUNCTION__, ibdev); + props->max_mtu = IB_MTU_4096; + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = + IB_PORT_CM_SUP | + 
IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = 2; + props->active_speed = 2; + props->max_msg_sz = -1; + + return 0; +} + +#ifdef notyet +static ssize_t show_rev(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); + return sprintf(buf, "%d\n", dev->rdev.t3cdev_p->type); +} + +static ssize_t show_fw_ver(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.fw_version); +} + +static ssize_t show_hca(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + CTR2(KTR_IW_CXGB, "%s class dev 0x%p", __FUNCTION__, dev); +#ifdef notyet + return sprintf(buf, "%x.%x\n", dev->rdev.rnic_info.pdev->vendor, + dev->rdev.rnic_info.pdev->device); +#else + return sprintf(buf, "%x.%x\n", 0xdead, 0xbeef); /* XXX */ +#endif +} + +static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct class_device_attribute *iwch_class_attributes[] = { + &class_device_attr_hw_rev, + &class_device_attr_fw_ver, + &class_device_attr_hca_type, + &class_device_attr_board_id +}; +#endif + +int iwch_register_device(struct iwch_dev *dev) +{ + int ret; +#ifdef notyet + int i; +#endif + CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev); + strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); +#ifdef notyet + memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); +#endif + dev->device_cap_flags = + (IB_DEVICE_ZERO_STAG | + IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW); + + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + 
memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dma_device = dev->rdev.rnic_info.pdev; + dev->ibdev.query_device = iwch_query_device; + dev->ibdev.query_port = iwch_query_port; + dev->ibdev.modify_port = iwch_modify_port; + dev->ibdev.query_pkey = iwch_query_pkey; + dev->ibdev.query_gid = iwch_query_gid; + dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; + dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; +#ifdef notyet + dev->ibdev.mmap = iwch_mmap; +#endif + dev->ibdev.alloc_pd = iwch_allocate_pd; + dev->ibdev.dealloc_pd = iwch_deallocate_pd; + dev->ibdev.create_ah = iwch_ah_create; + dev->ibdev.destroy_ah = iwch_ah_destroy; + dev->ibdev.create_qp = iwch_create_qp; + dev->ibdev.modify_qp = iwch_ib_modify_qp; + dev->ibdev.destroy_qp = iwch_destroy_qp; + dev->ibdev.create_cq = iwch_create_cq; + dev->ibdev.destroy_cq = iwch_destroy_cq; + dev->ibdev.resize_cq = iwch_resize_cq; + dev->ibdev.poll_cq = iwch_poll_cq; + dev->ibdev.get_dma_mr = iwch_get_dma_mr; + dev->ibdev.reg_phys_mr = iwch_register_phys_mem; + dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; + dev->ibdev.reg_user_mr = iwch_reg_user_mr; + dev->ibdev.dereg_mr = iwch_dereg_mr; + dev->ibdev.alloc_mw = iwch_alloc_mw; + dev->ibdev.bind_mw = iwch_bind_mw; + dev->ibdev.dealloc_mw = iwch_dealloc_mw; + + dev->ibdev.attach_mcast = iwch_multicast_attach; + dev->ibdev.detach_mcast = iwch_multicast_detach; + dev->ibdev.process_mad = iwch_process_mad; + + dev->ibdev.req_notify_cq = iwch_arm_cq; + dev->ibdev.post_send = iwch_post_send; + dev->ibdev.post_recv = iwch_post_receive; + + + dev->ibdev.iwcm = + (struct iw_cm_verbs *) kmalloc(sizeof(struct iw_cm_verbs), + M_NOWAIT); + dev->ibdev.iwcm->connect = iwch_connect; + dev->ibdev.iwcm->accept = iwch_accept_cr; + dev->ibdev.iwcm->reject = iwch_reject_cr; + dev->ibdev.iwcm->create_listen = iwch_create_listen; + dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; + dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; + dev->ibdev.iwcm->get_qp = iwch_get_qp; + + ret = ib_register_device(&dev->ibdev); + if (ret) + goto bail1; +#ifdef notyet + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { + ret = class_device_create_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); + if (ret) { + goto bail2; + } + } +#endif + return 0; +#ifdef notyet +bail2: +#endif + ib_unregister_device(&dev->ibdev); +bail1: + return ret; +} + +void iwch_unregister_device(struct iwch_dev *dev) +{ +#ifdef notyet + int i; + + CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, dev); + + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) + class_device_remove_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); +#endif + ib_unregister_device(&dev->ibdev); + return; +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h new file mode 100644 index 0000000000000..c857ce8e5b8f0 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h @@ -0,0 +1,362 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef __IWCH_PROVIDER_H__ +#define __IWCH_PROVIDER_H__ + +#include <contrib/rdma/ib_verbs.h> + +struct iwch_pd { + struct ib_pd ibpd; + u32 pdid; + struct iwch_dev *rhp; +}; + +#ifndef container_of +#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) +#endif +static __inline struct iwch_pd * +to_iwch_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct iwch_pd, ibpd); +} + +struct tpt_attributes { + u32 stag; + u32 state:1; + u32 type:2; + u32 rsvd:1; + enum tpt_mem_perm perms; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; + + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 len; + u64 va_fbo; + u32 pbl_size; +}; + +struct iwch_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +typedef struct iwch_mw iwch_mw_handle; + +static __inline struct iwch_mr * +to_iwch_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct iwch_mr, ibmr); +} + +struct iwch_mw { + struct ib_mw ibmw; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static __inline struct iwch_mw * +to_iwch_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct iwch_mw, ibmw); +} + +struct iwch_cq { + struct ib_cq ibcq; + struct iwch_dev *rhp; + struct t3_cq cq; + struct mtx lock; + int refcnt; + u32 /* __user */ *user_rptr_addr; +}; + +static __inline struct iwch_cq * +to_iwch_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct iwch_cq, ibcq); +} + +enum IWCH_QP_FLAGS { + QP_QUIESCED = 0x01 +}; + +struct iwch_mpa_attributes { + u8 recv_marker_enabled; + u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ + u8 crc_enabled; + u8 version; /* 0 or 1 */ +}; + +struct iwch_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; /* enable inbound Read Resp. */ + u8 enable_bind; + u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */ + /* + * Next QP state. If specify the current state, only the + * QP attributes will be modified. + */ + u32 max_ord; + u32 max_ird; + u32 pd; /* IN */ + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct iwch_mpa_attributes mpa_attr; /* IN-OUT */ + struct iwch_ep *llp_stream_handle; + char *stream_msg_buf; /* Last stream msg. 
before Idle -> RTS */ + u32 stream_msg_buf_len; /* Only on Idle -> RTS */ +}; + +struct iwch_qp { + struct ib_qp ibqp; + struct iwch_dev *rhp; + struct iwch_ep *ep; + struct iwch_qp_attributes attr; + struct t3_wq wq; + struct mtx lock; + int refcnt; + enum IWCH_QP_FLAGS flags; + struct callout timer; +}; + +static __inline int +qp_quiesced(struct iwch_qp *qhp) +{ + return qhp->flags & QP_QUIESCED; +} + +static __inline struct iwch_qp * +to_iwch_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct iwch_qp, ibqp); +} + +void iwch_qp_add_ref(struct ib_qp *qp); +void iwch_qp_rem_ref(struct ib_qp *qp); + +struct iwch_ucontext { + struct ib_ucontext ibucontext; + struct cxio_ucontext uctx; + u32 key; + struct mtx mmap_lock; + TAILQ_HEAD( ,iwch_mm_entry) mmaps; +}; + +static __inline struct iwch_ucontext * +to_iwch_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct iwch_ucontext, ibucontext); +} + +struct iwch_mm_entry { + TAILQ_ENTRY(iwch_mm_entry) entry; + u64 addr; + u32 key; + unsigned len; +}; + +static __inline struct iwch_mm_entry * +remove_mmap(struct iwch_ucontext *ucontext, + u32 key, unsigned len) +{ + struct iwch_mm_entry *tmp, *mm; + + mtx_lock(&ucontext->mmap_lock); + TAILQ_FOREACH_SAFE(mm, &ucontext->mmaps, entry, tmp) { + if (mm->key == key && mm->len == len) { + TAILQ_REMOVE(&ucontext->mmaps, mm, entry); + mtx_unlock(&ucontext->mmap_lock); + CTR4(KTR_IW_CXGB, "%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + mtx_unlock(&ucontext->mmap_lock); + + return NULL; +} + +static __inline void +insert_mmap(struct iwch_ucontext *ucontext, + struct iwch_mm_entry *mm) +{ + mtx_lock(&ucontext->mmap_lock); + CTR4(KTR_IW_CXGB, "%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + mm->key, (unsigned long long) mm->addr, mm->len); + TAILQ_INSERT_TAIL(&ucontext->mmaps, mm, entry); + mtx_unlock(&ucontext->mmap_lock); +} + +enum iwch_qp_attr_mask { + IWCH_QP_ATTR_NEXT_STATE = 1 << 0, + IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + IWCH_QP_ATTR_MAX_ORD = 1 << 11, + IWCH_QP_ATTR_MAX_IRD = 1 << 12, + IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + IWCH_QP_ATTR_MPA_ATTR = 1 << 24, + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_MAX_ORD | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_STREAM_MSG_BUFFER | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int iwch_modify_qp(struct iwch_dev *rhp, + struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal); + +enum iwch_qp_state { + IWCH_QP_STATE_IDLE, + IWCH_QP_STATE_RTS, + IWCH_QP_STATE_ERROR, + IWCH_QP_STATE_TERMINATE, + IWCH_QP_STATE_CLOSING, + IWCH_QP_STATE_TOT +}; + +static __inline int +iwch_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return IWCH_QP_STATE_IDLE; + case IB_QPS_RTS: + return IWCH_QP_STATE_RTS; + case IB_QPS_SQD: + return IWCH_QP_STATE_CLOSING; + case IB_QPS_SQE: + return IWCH_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return IWCH_QP_STATE_ERROR; + default: + return -1; + } +} + +static __inline u32 +iwch_ib_to_tpt_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? 
TPT_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) | + TPT_LOCAL_READ; +} + +static __inline u32 +iwch_ib_to_mwbind_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? T3_MEM_ACCESS_REM_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? T3_MEM_ACCESS_REM_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? T3_MEM_ACCESS_LOCAL_WRITE : 0) | + T3_MEM_ACCESS_LOCAL_READ; +} + +enum iwch_mmid_state { + IWCH_STAG_STATE_VALID, + IWCH_STAG_STATE_INVALID +}; + +enum iwch_qp_query_flags { + IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */ + IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */ + IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */ + + /* + * Quiesce QP context; Consumer + * will NOT replay outstanding WR + */ + IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4, + IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8, + IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ +}; + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_register_device(struct iwch_dev *dev); +void iwch_unregister_device(struct iwch_dev *dev); +int iwch_quiesce_qps(struct iwch_cq *chp); +int iwch_resume_qps(struct iwch_cq *chp); +void stop_read_rep_timer(struct iwch_qp *qhp); +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list); +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages); +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list); + + +#define IWCH_NODE_DESC "cxgb3 Chelsio Communications" + +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c new file mode 100644 index 0000000000000..3c203f1b43375 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_qp.c @@ -0,0 +1,1052 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + + +#define NO_SUPPORT -1 + +static int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, + u8 * flit_cnt) +{ + int i; + u32 plen; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE; + else + wqe->send.rdmaop = T3_SEND; + wqe->send.rem_stag = 0; + break; +#if 0 /* Not currently supported */ + case TYPE_SEND_INVALIDATE: + case TYPE_SEND_INVALIDATE_IMMEDIATE: + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey); + break; + case TYPE_SEND_SE_INVALIDATE: + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + wqe->send.rem_stag = htobe32(wr->wr.rdma.rkey); + break; +#endif + default: + break; + } + if (wr->num_sge > T3_MAX_SGE) + return (-EINVAL); + wqe->send.reserved[0] = 0; + wqe->send.reserved[1] = 0; + wqe->send.reserved[2] = 0; + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + plen = 4; + wqe->send.sgl[0].stag = wr->imm_data; + wqe->send.sgl[0].len = 0; + wqe->send.num_sgle = 0; + *flit_cnt = 5; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return (-EMSGSIZE); + } + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = + htobe32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = + htobe32(wr->sg_list[i].length); + wqe->send.sgl[i].to = htobe64(wr->sg_list[i].addr); + } + wqe->send.num_sgle = 
htobe32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); + } + wqe->send.plen = htobe32(plen); + return 0; +} + +static int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + int i; + u32 plen; + + if (wr->num_sge > T3_MAX_SGE) + return (-EINVAL); + wqe->write.rdmaop = T3_RDMA_WRITE; + wqe->write.reserved[0] = 0; + wqe->write.reserved[1] = 0; + wqe->write.reserved[2] = 0; + wqe->write.stag_sink = htobe32(wr->wr.rdma.rkey); + wqe->write.to_sink = htobe64(wr->wr.rdma.remote_addr); + + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + plen = 4; + wqe->write.sgl[0].stag = wr->imm_data; + wqe->write.sgl[0].len = 0; + wqe->write.num_sgle = 0; + *flit_cnt = 6; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return (-EMSGSIZE); + } + plen += wr->sg_list[i].length; + wqe->write.sgl[i].stag = + htobe32(wr->sg_list[i].lkey); + wqe->write.sgl[i].len = + htobe32(wr->sg_list[i].length); + wqe->write.sgl[i].to = + htobe64(wr->sg_list[i].addr); + } + wqe->write.num_sgle = htobe32(wr->num_sge); + *flit_cnt = 5 + ((wr->num_sge) << 1); + } + wqe->write.plen = htobe32(plen); + return 0; +} + +static int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + if (wr->num_sge > 1) + return (-EINVAL); + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.reserved[2] = 0; + wqe->read.rem_stag = htobe32(wr->wr.rdma.rkey); + wqe->read.rem_to = htobe64(wr->wr.rdma.remote_addr); + wqe->read.local_stag = htobe32(wr->sg_list[0].lkey); + wqe->read.local_len = htobe32(wr->sg_list[0].length); + wqe->read.local_to = htobe64(wr->sg_list[0].addr); + *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + return 0; +} + +/* + * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now. 
+ */ +static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, + u32 num_sgle, u32 * pbl_addr, u8 * page_size) +{ + int i; + struct iwch_mr *mhp; + u32 offset; + for (i = 0; i < num_sgle; i++) { + + mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); + if (!mhp) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EIO); + } + if (!mhp->attr.state) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EIO); + } + if (mhp->attr.zbva) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EIO); + } + + if (sg_list[i].addr < mhp->attr.va_fbo) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EINVAL); + } + if (sg_list[i].addr + ((u64) sg_list[i].length) < + sg_list[i].addr) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EINVAL); + } + if (sg_list[i].addr + ((u64) sg_list[i].length) > + mhp->attr.va_fbo + ((u64) mhp->attr.len)) { + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + return (-EINVAL); + } + offset = sg_list[i].addr - mhp->attr.va_fbo; + offset += ((u32) mhp->attr.va_fbo) % + (1UL << (12 + mhp->attr.page_size)); + pbl_addr[i] = ((mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3) + + (offset >> (12 + mhp->attr.page_size)); + page_size[i] = mhp->attr.page_size; + } + return 0; +} + +static int iwch_build_rdma_recv(struct iwch_dev *rhp, union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i; + if (wr->num_sge > T3_MAX_SGE) + return (-EINVAL); + wqe->recv.num_sgle = htobe32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { + wqe->recv.sgl[i].stag = htobe32(wr->sg_list[i].lkey); + wqe->recv.sgl[i].len = htobe32(wr->sg_list[i].length); + wqe->recv.sgl[i].to = htobe64(wr->sg_list[i].addr); + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + } + return 0; +} + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 t3_wr_flit_cnt = 0; + enum t3_wr_opcode t3_wr_opcode = 0; + enum t3_wr_flags t3_wr_flags; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(ibqp); + mtx_lock(&qhp->lock); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs <= 0) { + mtx_unlock(&qhp->lock); + return (-ENOMEM); + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + t3_wr_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + t3_wr_flags |= T3_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_READ_FENCE_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED) + t3_wr_flags |= T3_COMPLETION_FLAG; + sqp = qhp->wq.sq + + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + t3_wr_opcode = T3_WR_SEND; + err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + t3_wr_opcode = T3_WR_WRITE; + err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_READ: + t3_wr_opcode = T3_WR_READ; + t3_wr_flags = 0; /* T3 reads are always signaled */ + err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt); + if (err) + break; + sqp->read_len = wqe->read.local_len; + if (!qhp->wq.oldest_read) + 
qhp->wq.oldest_read = sqp; + break; + default: + CTR2(KTR_IW_CXGB, "%s post of type=%d TBD!", __FUNCTION__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp->wr_id = wr->wr_id; + sqp->opcode = wr2opcode(t3_wr_opcode); + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED); + + build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, t3_wr_flit_cnt); + CTR5(KTR_IW_CXGB, "%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d", + __FUNCTION__, (unsigned long long) wr->wr_id, idx, + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2), + sqp->opcode); + wr = wr->next; + num_wrs--; + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + } + mtx_unlock(&qhp->lock); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + + qhp = to_iwch_qp(ibqp); + mtx_lock(&qhp->lock); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, + qhp->wq.rq_size_log2) - 1; + if (!wr) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + while (wr) { + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + if (num_wrs) + err = iwch_build_rdma_recv(qhp->rhp, wqe, wr); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] = + wr->wr_id; + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, sizeof(struct t3_receive_wr) >> 3); + CTR6(KTR_IW_CXGB, "%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x " + "wqe %p ", __FUNCTION__, (unsigned long long) wr->wr_id, + idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe); + ++(qhp->wq.rq_wptr); + ++(qhp->wq.wptr); + wr = wr->next; + num_wrs--; + } + mtx_unlock(&qhp->lock); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + struct iwch_qp *qhp; + union t3_wr *wqe; + u32 pbl_addr; + u8 page_size; + u32 num_wrs; + struct ib_sge sgl; + int err=0; + enum t3_wr_flags t3_wr_flags; + u32 idx; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(qp); + mhp = to_iwch_mw(mw); + rhp = qhp->rhp; + + mtx_lock(&qhp->lock); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + mtx_unlock(&qhp->lock); + return (-EINVAL); + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if ((num_wrs) <= 0) { + mtx_unlock(&qhp->lock); + return (-ENOMEM); + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + CTR4(KTR_IW_CXGB, "%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p", __FUNCTION__, idx, + mw, mw_bind); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + + t3_wr_flags = 0; + if (mw_bind->send_flags & IB_SEND_SIGNALED) + t3_wr_flags = T3_COMPLETION_FLAG; + + sgl.addr = mw_bind->addr; + sgl.lkey = mw_bind->mr->lkey; + sgl.length = mw_bind->length; + wqe->bind.reserved = 0; + wqe->bind.type = T3_VA_BASED_TO; + + /* TBD: check perms */ + wqe->bind.perms = iwch_ib_to_mwbind_access(mw_bind->mw_access_flags); + wqe->bind.mr_stag = htobe32(mw_bind->mr->lkey); + wqe->bind.mw_stag = htobe32(mw->rkey); + wqe->bind.mw_len = 
htobe32(mw_bind->length); + wqe->bind.mw_va = htobe64(mw_bind->addr); + err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); + if (err) { + mtx_unlock(&qhp->lock); + return (err); + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + sqp->wr_id = mw_bind->wr_id; + sqp->opcode = T3_BIND_MW; + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); + wqe->bind.mr_pbl_addr = htobe32(pbl_addr); + wqe->bind.mr_pagesz = page_size; + wqe->flit[T3_SQ_COOKIE_FLIT] = mw_bind->wr_id; + build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, + sizeof(struct t3_bind_mw_wr) >> 3); + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + mtx_unlock(&qhp->lock); + + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + + return err; +} + +static inline void build_term_codes(struct respQ_msg_t *rsp_msg, + u8 *layer_type, u8 *ecode) +{ + int status = TPT_ERR_INTERNAL_ERR; + int tagged = 0; + int opcode = -1; + int rqtype = 0; + int send_inv = 0; + + if (rsp_msg) { + status = CQE_STATUS(rsp_msg->cqe); + opcode = CQE_OPCODE(rsp_msg->cqe); + rqtype = RQ_TYPE(rsp_msg->cqe); + send_inv = (opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV); + tagged = (opcode == T3_RDMA_WRITE) || + (rqtype && (opcode == T3_READ_RESP)); + } + + switch (status) { + case TPT_ERR_STAG: + if (send_inv) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case TPT_ERR_PDID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + if ((opcode == T3_SEND_WITH_INV) || + (opcode == T3_SEND_WITH_SE_INV)) + *ecode = RDMAP_CANT_INV_STAG; + else + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_QPID: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + break; + case TPT_ERR_ACCESS: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_ACC_VIOL; + break; + case TPT_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case TPT_ERR_BOUND: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case TPT_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case TPT_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case TPT_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case TPT_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case TPT_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case TPT_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case TPT_ERR_OPCODE: + 
*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case TPT_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case TPT_ERR_MSN: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case TPT_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +/* + * This posts a TERMINATE with layer=RDMA, type=catastrophic. + */ +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) +{ + union t3_wr *wqe; + struct terminate_message *term; + struct mbuf *m; + + CTR2(KTR_IW_CXGB, "%s %d", __FUNCTION__, __LINE__); + m = m_gethdr(MT_DATA, M_NOWAIT); + if (!m) { + log(LOG_ERR, "%s cannot send TERMINATE!\n", __FUNCTION__); + return (-ENOMEM); + } + wqe = mtod(m, union t3_wr *); + m->m_len = m->m_pkthdr.len = 40; + memset(wqe, 0, 40); + wqe->send.rdmaop = T3_TERMINATE; + + /* immediate data length */ + wqe->send.plen = htonl(4); + + /* immediate data starts here. */ + term = (struct terminate_message *)wqe->send.sgl; + build_term_codes(rsp_msg, &term->layer_etype, &term->ecode); + wqe->send.wrh.op_seop_flags = htobe32(V_FW_RIWR_OP(T3_WR_SEND) | + V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG)); + wqe->send.wrh.gen_tid_len = htobe32(V_FW_RIWR_TID(qhp->ep->hwtid)); + + m_set_priority(m, CPL_PRIORITY_DATA); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + return cxgb_ofld_send(qhp->rhp->rdev.t3cdev_p, m); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct iwch_qp *qhp) +{ + struct iwch_cq *rchp, *schp; + int count; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + CTR4(KTR_IW_CXGB, "%s qhp %p rchp %p schp %p", __FUNCTION__, qhp, rchp, schp); + /* take a ref on the qhp since we must release the lock */ + qhp->refcnt++; + mtx_unlock(&qhp->lock); + + /* locking heirarchy: cq lock first, then qp lock. */ + mtx_lock(&rchp->lock); + mtx_lock(&qhp->lock); + cxio_flush_hw_cq(&rchp->cq); + cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); + cxio_flush_rq(&qhp->wq, &rchp->cq, count); + mtx_unlock(&qhp->lock); + mtx_unlock(&rchp->lock); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + + /* locking heirarchy: cq lock first, then qp lock. */ + mtx_lock(&schp->lock); + mtx_lock(&qhp->lock); + cxio_flush_hw_cq(&schp->cq); + cxio_count_scqes(&schp->cq, &qhp->wq, &count); + cxio_flush_sq(&qhp->wq, &schp->cq, count); + mtx_unlock(&qhp->lock); + mtx_unlock(&schp->lock); + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + + /* deref */ + mtx_lock(&qhp->lock); + if (--qhp->refcnt == 0) + wakeup(qhp); +} + +static void flush_qp(struct iwch_qp *qhp) +{ + if (qhp->ibqp.uobject) + cxio_set_wq_in_error(&qhp->wq); + else + __flush_qp(qhp); +} + + +/* + * Return non zero if at least one RECV was pre-posted. 
+ */ +static int rqes_posted(struct iwch_qp *qhp) +{ + return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV; +} + +static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs) +{ + struct t3_rdma_init_attr init_attr; + int ret; + + init_attr.tid = qhp->ep->hwtid; + init_attr.qpid = qhp->wq.qpid; + init_attr.pdid = qhp->attr.pd; + init_attr.scqid = qhp->attr.scq; + init_attr.rcqid = qhp->attr.rcq; + init_attr.rq_addr = qhp->wq.rq_addr; + init_attr.rq_size = 1 << qhp->wq.rq_size_log2; + init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE | + qhp->attr.mpa_attr.recv_marker_enabled | + (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | + (qhp->attr.mpa_attr.crc_enabled << 2); + + /* + * XXX - The IWCM doesn't quite handle getting these + * attrs set before going into RTS. For now, just turn + * them on always... + */ +#if 0 + init_attr.qpcaps = qhp->attr.enableRdmaRead | + (qhp->attr.enableRdmaWrite << 1) | + (qhp->attr.enableBind << 2) | + (qhp->attr.enable_stag0_fastreg << 3) | + (qhp->attr.enable_stag0_fastreg << 4); +#else + init_attr.qpcaps = 0x1f; +#endif + init_attr.tcp_emss = qhp->ep->emss; + init_attr.ord = qhp->attr.max_ord; + init_attr.ird = qhp->attr.max_ird; + init_attr.qp_dma_addr = qhp->wq.dma_addr; + init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); + init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0; + init_attr.irs = qhp->ep->rcv_seq; + CTR5(KTR_IW_CXGB, "%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " + "flags 0x%x qpcaps 0x%x", __FUNCTION__, + init_attr.rq_addr, init_attr.rq_size, + init_attr.flags, init_attr.qpcaps); + ret = cxio_rdma_init(&rhp->rdev, &init_attr); + CTR2(KTR_IW_CXGB, "%s ret %d", __FUNCTION__, ret); + return ret; +} + +int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct iwch_qp_attributes newattr = qhp->attr; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct iwch_ep *ep = NULL; + + CTR6(KTR_IW_CXGB, "%s qhp %p qpid 0x%x ep %p state %d -> %d", __FUNCTION__, + qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state, + (mask & IWCH_QP_ATTR_NEXT_STATE) ? 
attrs->next_state : -1); + + mtx_lock(&qhp->lock); + + /* Process attr changes if in IDLE */ + if (mask & IWCH_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != IWCH_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & IWCH_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > + rhp->attr.max_rdma_read_qp_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & IWCH_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > + rhp->attr.max_rdma_reads_per_qp) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (!(mask & IWCH_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case IWCH_QP_STATE_IDLE: + switch (attrs->next_state) { + case IWCH_QP_STATE_RTS: + if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + qhp->attr.state = IWCH_QP_STATE_RTS; + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. + */ + get_ep(&qhp->ep->com); + mtx_unlock(&qhp->lock); + ret = rdma_init(rhp, qhp, mask, attrs); + mtx_lock(&qhp->lock); + if (ret) + goto err; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + flush_qp(qhp); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_RTS: + switch (attrs->next_state) { + case IWCH_QP_STATE_CLOSING: + PANIC_IF(atomic_load_acq_int(&qhp->ep->com.refcount) < 2); + qhp->attr.state = IWCH_QP_STATE_CLOSING; + if (!internal) { + abort=0; + disconnect = 1; + ep = qhp->ep; + } + flush_qp(qhp); + break; + case IWCH_QP_STATE_TERMINATE: + qhp->attr.state = IWCH_QP_STATE_TERMINATE; + if (qhp->ibqp.uobject) + cxio_set_wq_in_error(&qhp->wq); + if (!internal) + terminate = 1; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + if (!internal) { + abort=1; + disconnect = 1; + ep = qhp->ep; + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case IWCH_QP_STATE_IDLE: + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.llp_stream_handle = NULL; + put_ep(&qhp->ep->com); + qhp->ep = NULL; + wakeup(qhp); + break; + case IWCH_QP_STATE_ERROR: + disconnect=1; + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case IWCH_QP_STATE_ERROR: + if (attrs->next_state != IWCH_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + + if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) || + !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) { + ret = -EINVAL; + goto out; + } + qhp->attr.state = IWCH_QP_STATE_IDLE; + memset(&qhp->attr, 0, sizeof(qhp->attr)); + break; + case IWCH_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + log(LOG_ERR, "%s in a bad state %d\n", + __FUNCTION__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: 
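/*
 * Common error exit: detach the endpoint from the QP, force the QP into the
 * ERROR state, wake any threads sleeping on it, and flush outstanding work
 * requests; the endpoint reference itself is dropped once the lock is released.
 */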
+ CTR3(KTR_IW_CXGB, "%s disassociating ep %p qpid 0x%x", __FUNCTION__, qhp->ep, + qhp->wq.qpid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + ep = qhp->ep; + qhp->ep = NULL; + qhp->attr.state = IWCH_QP_STATE_ERROR; + free=1; + wakeup(qhp); + PANIC_IF(!ep); + flush_qp(qhp); +out: + mtx_unlock(&qhp->lock); + + if (terminate) + iwch_post_terminate(qhp, NULL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) + iwch_ep_disconnect(ep, abort, M_NOWAIT); + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + put_ep(&ep->com); + + CTR2(KTR_IW_CXGB, "%s exit state %d", __FUNCTION__, qhp->attr.state); + return ret; +} + +static int quiesce_qp(struct iwch_qp *qhp) +{ + mtx_lock(&qhp->lock); + iwch_quiesce_tid(qhp->ep); + qhp->flags |= QP_QUIESCED; + mtx_unlock(&qhp->lock); + return 0; +} + +static int resume_qp(struct iwch_qp *qhp) +{ + mtx_lock(&qhp->lock); + iwch_resume_tid(qhp->ep); + qhp->flags &= ~QP_QUIESCED; + mtx_lock(&qhp->lock); + return 0; +} + +int iwch_quiesce_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) { + quiesce_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp)) + quiesce_qp(qhp); + } + return 0; +} + +int iwch_resume_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) { + resume_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp)) + resume_qp(qhp); + } + return 0; +} + diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c new file mode 100644 index 0000000000000..9d3618fbd57f0 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.c @@ -0,0 +1,382 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/pciio.h> +#include <sys/conf.h> +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus_dma.h> +#include <sys/rman.h> +#include <sys/ioccom.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/linker.h> +#include <sys/firmware.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/smp.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/proc.h> +#include <sys/queue.h> +#include <sys/libkern.h> + +#include <netinet/in.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_umem.h> +#include <contrib/rdma/ib_user_verbs.h> + + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#include <ulp/iw_cxgb/iw_cxgb_wr.h> +#include <ulp/iw_cxgb/iw_cxgb_hal.h> +#include <ulp/iw_cxgb/iw_cxgb_provider.h> +#include <ulp/iw_cxgb/iw_cxgb_cm.h> +#include <ulp/iw_cxgb/iw_cxgb.h> +#include <ulp/iw_cxgb/iw_cxgb_resource.h> +#include <ulp/iw_cxgb/iw_cxgb_user.h> +#else +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_hal.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_provider.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_cm.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h> +#include <dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h> +#endif + +#ifdef needed +static struct buf_ring *rhdl_fifo; +static struct mtx rhdl_fifo_lock; +#endif + +#define RANDOM_SIZE 16 + +static int __cxio_init_resource_fifo(struct buf_ring **fifo, + struct mtx *fifo_lock, + u32 nr, u32 skip_low, + u32 skip_high, + int randomize) +{ + u32 i, j, idx; + u32 random_bytes; + u32 rarray[16]; + mtx_init(fifo_lock, "cxio fifo", NULL, MTX_DEF|MTX_DUPOK); + + *fifo = buf_ring_alloc(nr, M_NOWAIT); + if (*fifo == NULL) + return (-ENOMEM); +#if 0 + for (i = 0; i < skip_low + skip_high; i++) { + u32 entry = 0; + + buf_ring_enqueue(*fifo, (uintptr_t) entry); + } +#endif + if (randomize) { + j = 0; + random_bytes = random(); + for (i = 0; i < RANDOM_SIZE; i++) + rarray[i] = i + skip_low; + for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { + if (j >= RANDOM_SIZE) { + j = 0; + random_bytes = random(); + } + idx = (random_bytes >> (j * 2)) & 0xF; + buf_ring_enqueue(*fifo, (void *)(uintptr_t)rarray[idx]); + rarray[idx] = i; + j++; + } + for (i = 0; i < RANDOM_SIZE; i++) + buf_ring_enqueue(*fifo, (void *) (uintptr_t)rarray[i]); + } else + for (i = skip_low; i < nr - skip_high; i++) + buf_ring_enqueue(*fifo, (void *) (uintptr_t)i); +#if 0 + for (i = 0; i < skip_low + skip_high; i++) + buf_ring_dequeue(*fifo); +#endif + return 0; +} + +static int cxio_init_resource_fifo(struct buf_ring **fifo, struct mtx * fifo_lock, + u32 nr, u32 
skip_low, u32 skip_high) +{ + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 0)); +} + +static int cxio_init_resource_fifo_random(struct buf_ring **fifo, + struct mtx * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 1)); +} + +static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) +{ + u32 i; + + mtx_init(&rdev_p->rscp->qpid_fifo_lock, "qpid fifo", NULL, MTX_DEF); + + rdev_p->rscp->qpid_fifo = buf_ring_alloc(T3_MAX_NUM_QP, M_NOWAIT); + if (rdev_p->rscp->qpid_fifo == NULL) + return (-ENOMEM); + + for (i = 16; i < T3_MAX_NUM_QP; i++) + if (!(i & rdev_p->qpmask)) + buf_ring_enqueue(rdev_p->rscp->qpid_fifo, (void *) (uintptr_t)i); + return 0; +} + +#ifdef needed +int cxio_hal_init_rhdl_resource(u32 nr_rhdl) +{ + return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1, + 0); +} + +void cxio_hal_destroy_rhdl_resource(void) +{ + buf_ring_free(rhdl_fifo); +} +#endif + +/* nr_* must be power of 2 */ +int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid) +{ + int err = 0; + struct cxio_hal_resource *rscp; + + rscp = malloc(sizeof(*rscp), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!rscp) + return (-ENOMEM); + rdev_p->rscp = rscp; + err = cxio_init_resource_fifo_random(&rscp->tpt_fifo, + &rscp->tpt_fifo_lock, + nr_tpt, 1, 0); + if (err) + goto tpt_err; + err = cxio_init_qpid_fifo(rdev_p); + if (err) + goto qpid_err; + err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, + nr_cqid, 1, 0); + if (err) + goto cqid_err; + err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; +pdid_err: + buf_ring_free(rscp->cqid_fifo); +cqid_err: + buf_ring_free(rscp->qpid_fifo); +qpid_err: + buf_ring_free(rscp->tpt_fifo); +tpt_err: + return (-ENOMEM); +} + +/* + * returns 0 if no resource available + */ +static u32 cxio_hal_get_resource(struct buf_ring *fifo, struct mtx *lock) +{ + u32 entry; + + mtx_lock(lock); + entry = (u32)(uintptr_t)buf_ring_dequeue(fifo); + mtx_unlock(lock); + return entry; +} + +static void cxio_hal_put_resource(struct buf_ring *fifo, u32 entry, struct mtx *lock) +{ + mtx_lock(lock); + buf_ring_enqueue(fifo, (void *) (uintptr_t)entry); + mtx_unlock(lock); +} + +u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->tpt_fifo, &rscp->tpt_fifo_lock); +} + +void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) +{ + cxio_hal_put_resource(rscp->tpt_fifo, stag, &rscp->tpt_fifo_lock); +} + +u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) +{ + u32 qpid = cxio_hal_get_resource(rscp->qpid_fifo, &rscp->qpid_fifo_lock); + CTR2(KTR_IW_CXGB, "%s qpid 0x%x", __FUNCTION__, qpid); + return qpid; +} + +void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) +{ + CTR2(KTR_IW_CXGB, "%s qpid 0x%x", __FUNCTION__, qpid); + cxio_hal_put_resource(rscp->qpid_fifo, qpid, &rscp->qpid_fifo_lock); +} + +u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->cqid_fifo, &rscp->cqid_fifo_lock); +} + +void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) +{ + cxio_hal_put_resource(rscp->cqid_fifo, cqid, &rscp->cqid_fifo_lock); +} + +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->pdid_fifo, &rscp->pdid_fifo_lock); +} + +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 
pdid) +{ + cxio_hal_put_resource(rscp->pdid_fifo, pdid, &rscp->pdid_fifo_lock); +} + +void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) +{ + buf_ring_free(rscp->tpt_fifo); + buf_ring_free(rscp->cqid_fifo); + buf_ring_free(rscp->qpid_fifo); + buf_ring_free(rscp->pdid_fifo); + free(rscp, M_DEVBUF); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ +#define PBL_CHUNK 2*1024*1024 + +u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size); + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, (u32)addr, size); + return (u32)addr; +} + +void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, addr, size); + gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size); +} + +int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) +{ + + rdev_p->pbl_pool = gen_pool_create(rdev_p->rnic_info.pbl_base, MIN_PBL_SHIFT, + rdev_p->rnic_info.pbl_top - rdev_p->rnic_info.pbl_base); +#if 0 + if (rdev_p->pbl_pool) { + + unsigned long i; + for (i = rdev_p->rnic_info.pbl_base; + i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1; + i += PBL_CHUNK) + gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1); + } +#endif + return rdev_p->pbl_pool ? 0 : (-ENOMEM); +} + +void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == mini RQT size (16 entries) */ +#define RQT_CHUNK 2*1024*1024 + +u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6); + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, (u32)addr, size << 6); + return (u32)addr; +} + +void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + CTR3(KTR_IW_CXGB, "%s addr 0x%x size %d", __FUNCTION__, addr, size << 6); + gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6); +} + +int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p) +{ + + rdev_p->rqt_pool = gen_pool_create(rdev_p->rnic_info.rqt_base, + MIN_RQT_SHIFT, rdev_p->rnic_info.rqt_top - rdev_p->rnic_info.rqt_base); +#if 0 + if (rdev_p->rqt_pool) { + unsigned long i; + + for (i = rdev_p->rnic_info.rqt_base; + i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1; + i += RQT_CHUNK) + gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1); + } +#endif + return rdev_p->rqt_pool ? 0 : (-ENOMEM); +} + +void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->rqt_pool); +} diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h new file mode 100644 index 0000000000000..e0282a3453028 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_resource.h @@ -0,0 +1,59 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef __CXIO_RESOURCE_H__ +#define __CXIO_RESOURCE_H__ + +extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl); +extern void cxio_hal_destroy_rhdl_resource(void); +extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, + u32 nr_pdid); +extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag); +extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid); +extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid); +extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp); + +#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base ) +extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); + +#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base ) +extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h new file mode 100644 index 0000000000000..3086a6340a115 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_user.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2007, 2008 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $FreeBSD$ + */ +#ifndef __IWCH_USER_H__ +#define __IWCH_USER_H__ + +#define IWCH_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in uint64_t + * instead. + */ +struct iwch_create_cq_req { + uint64_t user_rptr_addr; +}; + +struct iwch_create_cq_resp { + uint64_t key; + uint32_t cqid; + uint32_t size_log2; +}; + +struct iwch_create_qp_resp { + uint64_t key; + uint64_t db_key; + uint32_t qpid; + uint32_t size_log2; + uint32_t sq_size_log2; + uint32_t rq_size_log2; +}; + +struct iwch_reg_user_mr_resp { + uint32_t pbl_addr; +}; +#endif diff --git a/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h new file mode 100644 index 0000000000000..bf8f2d609e041 --- /dev/null +++ b/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb_wr.h @@ -0,0 +1,684 @@ +/************************************************************************** + +Copyright (c) 2007, 2008 Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +$FreeBSD$ + +***************************************************************************/ +#ifndef __CXIO_WR_H__ +#define __CXIO_WR_H__ +#define T3_MAX_SGE 4 +#define T3_MAX_INLINE 64 + +#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) +#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ + ((rptr)!=(wptr)) ) +#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1)) +#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<<size_log2)-((wptr)-(rptr))) +#define Q_COUNT(rptr,wptr) ((wptr)-(rptr)) +#define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1)) + +static __inline void +ring_doorbell(void /* __iomem */ *doorbell, u32 qpid) +{ + writel(doorbell, ((1<<31) | qpid)); +} + +#define SEQ32_GE(x,y) (!( (((u32) (x)) - ((u32) (y))) & 0x80000000 )) + +enum t3_wr_flags { + T3_COMPLETION_FLAG = 0x01, + T3_NOTIFY_FLAG = 0x02, + T3_SOLICITED_EVENT_FLAG = 0x04, + T3_READ_FENCE_FLAG = 0x08, + T3_LOCAL_FENCE_FLAG = 0x10 +} __attribute__ ((packed)); + +enum t3_wr_opcode { + T3_WR_BP = FW_WROPCODE_RI_BYPASS, + T3_WR_SEND = FW_WROPCODE_RI_SEND, + T3_WR_WRITE = FW_WROPCODE_RI_RDMA_WRITE, + T3_WR_READ = FW_WROPCODE_RI_RDMA_READ, + T3_WR_INV_STAG = FW_WROPCODE_RI_LOCAL_INV, + T3_WR_BIND = FW_WROPCODE_RI_BIND_MW, + T3_WR_RCV = FW_WROPCODE_RI_RECEIVE, + T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT, + T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP +} __attribute__ ((packed)); + +enum t3_rdma_opcode { + T3_RDMA_WRITE, /* IETF RDMAP v1.0 ... */ + T3_READ_REQ, + T3_READ_RESP, + T3_SEND, + T3_SEND_WITH_INV, + T3_SEND_WITH_SE, + T3_SEND_WITH_SE_INV, + T3_TERMINATE, + T3_RDMA_INIT, /* CHELSIO RI specific ... */ + T3_BIND_MW, + T3_FAST_REGISTER, + T3_LOCAL_INV, + T3_QP_MOD, + T3_BYPASS +} __attribute__ ((packed)); + +static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop) +{ + switch (wrop) { + case T3_WR_BP: return T3_BYPASS; + case T3_WR_SEND: return T3_SEND; + case T3_WR_WRITE: return T3_RDMA_WRITE; + case T3_WR_READ: return T3_READ_REQ; + case T3_WR_INV_STAG: return T3_LOCAL_INV; + case T3_WR_BIND: return T3_BIND_MW; + case T3_WR_INIT: return T3_RDMA_INIT; + case T3_WR_QP_MOD: return T3_QP_MOD; + default: break; + } + return -1; +} + + +/* Work request id */ +union t3_wrid { + struct { + u32 hi; + u32 low; + } id0; + u64 id1; +}; + +#define WRID(wrid) (wrid.id1) +#define WRID_GEN(wrid) (wrid.id0.wr_gen) +#define WRID_IDX(wrid) (wrid.id0.wr_idx) +#define WRID_LO(wrid) (wrid.id0.wr_lo) + +struct fw_riwrh { + __be32 op_seop_flags; + __be32 gen_tid_len; +}; + +#define S_FW_RIWR_OP 24 +#define M_FW_RIWR_OP 0xff +#define V_FW_RIWR_OP(x) ((x) << S_FW_RIWR_OP) +#define G_FW_RIWR_OP(x) ((((x) >> S_FW_RIWR_OP)) & M_FW_RIWR_OP) + +#define S_FW_RIWR_SOPEOP 22 +#define M_FW_RIWR_SOPEOP 0x3 +#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP) + +#define S_FW_RIWR_FLAGS 8 +#define M_FW_RIWR_FLAGS 0x3fffff +#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS) +#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS) + +#define S_FW_RIWR_TID 8 +#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID) + +#define S_FW_RIWR_LEN 0 +#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN) + +#define S_FW_RIWR_GEN 31 +#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN) + +struct t3_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +/* If num_sgle is zero, flit 5+ contains immediate data.*/ +struct t3_send_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be32 plen; /* 3 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */ +}; + 
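The S_/M_/V_/G_ macros above follow the convention used throughout these headers: S_* is a field's bit offset, M_* its width mask, V_*(x) shifts a value into position, and G_*(x) extracts it again. A minimal sketch of round-tripping the opcode field of a work request header with the FW_RIWR macros defined above (byte-order conversion is left out here; the real structures store these words big-endian via htobe32()):

    /* Illustration only: pack opcode, SOP/EOP and flags, then recover
     * the opcode.  All names come from the definitions above. */
    u32 hdr = V_FW_RIWR_OP(T3_WR_SEND) |
              V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) |   /* mark SOP and EOP */
              V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG);

    enum t3_wr_opcode op = G_FW_RIWR_OP(hdr);        /* yields T3_WR_SEND */

The same naming convention is used further down for the TPT entry and CQE fields, so each G_* accessor can be read directly off its S_*/M_* pair.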
+struct t3_local_inv_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 reserved3; +}; + +struct t3_rdma_write_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 stag_sink; + __be64 to_sink; /* 3 */ + __be32 plen; /* 4 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */ +}; + +struct t3_rdma_read_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be64 rem_to; /* 3 */ + __be32 local_stag; /* 4 */ + __be32 local_len; + __be64 local_to; /* 5 */ +}; + +enum t3_addr_type { + T3_VA_BASED_TO = 0x0, + T3_ZERO_BASED_TO = 0x1 +} __attribute__ ((packed)); + +enum t3_mem_perms { + T3_MEM_ACCESS_LOCAL_READ = 0x1, + T3_MEM_ACCESS_LOCAL_WRITE = 0x2, + T3_MEM_ACCESS_REM_READ = 0x4, + T3_MEM_ACCESS_REM_WRITE = 0x8 +} __attribute__ ((packed)); + +struct t3_bind_mw_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u16 reserved; /* 2 */ + u8 type; + u8 perms; + __be32 mr_stag; + __be32 mw_stag; /* 3 */ + __be32 mw_len; + __be64 mw_va; /* 4 */ + __be32 mr_pbl_addr; /* 5 */ + u8 reserved2[3]; + u8 mr_pagesz; +}; + +struct t3_receive_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 pagesz[T3_MAX_SGE]; + __be32 num_sgle; /* 2 */ + struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */ + __be32 pbl_addr[T3_MAX_SGE]; +}; + +struct t3_bypass_wr { + struct fw_riwrh wrh; + union t3_wrid wrid; /* 1 */ +}; + +struct t3_modify_qp_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 flags; /* 2 */ + __be32 quiesce; /* 2 */ + __be32 max_ird; /* 3 */ + __be32 max_ord; /* 3 */ + __be64 sge_cmd; /* 4 */ + __be64 ctx1; /* 5 */ + __be64 ctx0; /* 6 */ +}; + +enum t3_modify_qp_flags { + MODQP_QUIESCE = 0x01, + MODQP_MAX_IRD = 0x02, + MODQP_MAX_ORD = 0x04, + MODQP_WRITE_EC = 0x08, + MODQP_READ_EC = 0x10, +}; + + +enum t3_mpa_attrs { + uP_RI_MPA_RX_MARKER_ENABLE = 0x1, + uP_RI_MPA_TX_MARKER_ENABLE = 0x2, + uP_RI_MPA_CRC_ENABLE = 0x4, + uP_RI_MPA_IETF_ENABLE = 0x8 +} __attribute__ ((packed)); + +enum t3_qp_caps { + uP_RI_QP_RDMA_READ_ENABLE = 0x01, + uP_RI_QP_RDMA_WRITE_ENABLE = 0x02, + uP_RI_QP_BIND_ENABLE = 0x04, + uP_RI_QP_FAST_REGISTER_ENABLE = 0x08, + uP_RI_QP_STAG0_ENABLE = 0x10 +} __attribute__ ((packed)); + +struct t3_rdma_init_attr { + u32 tid; + u32 qpid; + u32 pdid; + u32 scqid; + u32 rcqid; + u32 rq_addr; + u32 rq_size; + enum t3_mpa_attrs mpaattrs; + enum t3_qp_caps qpcaps; + u16 tcp_emss; + u32 ord; + u32 ird; + u64 qp_dma_addr; + u32 qp_dma_size; + u32 flags; + u32 irs; +}; + +struct t3_rdma_init_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 qpid; /* 2 */ + __be32 pdid; + __be32 scqid; /* 3 */ + __be32 rcqid; + __be32 rq_addr; /* 4 */ + __be32 rq_size; + u8 mpaattrs; /* 5 */ + u8 qpcaps; + __be16 ulpdu_size; + __be32 flags; /* bits 31-1 - reservered */ + /* bit 0 - set if RECV posted */ + __be32 ord; /* 6 */ + __be32 ird; + __be64 qp_dma_addr; /* 7 */ + __be32 qp_dma_size; /* 8 */ + u32 irs; +}; + +struct t3_genbit { + u64 flit[15]; + __be64 genbit; +}; + +enum rdma_init_wr_flags { + RECVS_POSTED = 1, +}; + +union t3_wr { + struct t3_send_wr send; + struct t3_rdma_write_wr write; + struct t3_rdma_read_wr read; + struct t3_receive_wr recv; + struct t3_local_inv_wr local_inv; + struct t3_bind_mw_wr bind; + struct t3_bypass_wr bypass; + struct t3_rdma_init_wr init; + struct t3_modify_qp_wr qp_mod; + struct t3_genbit genbit; + u64 flit[16]; +}; + 
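Every work request variant above overlays the same 16-flit (128-byte) slot, which is why union t3_wr ends in u64 flit[16] and why struct t3_genbit aliases the last flit as the generation bit. The Q_* macros near the top of this header treat the ring's read and write pointers as free-running counters; a short worked sketch, assuming a 2^6 = 64-entry ring:

    /* Illustration only: free-running pointers into a 64-entry ring. */
    u32 size_log2 = 6;
    u32 rptr = 10, wptr = 70;                     /* wptr has wrapped once */

    u32 idx  = Q_PTR2IDX(wptr, size_log2);        /* 70 & 63 == 6          */
    int gen  = Q_GENBIT(wptr, size_log2);         /* !((70 >> 6) & 1) == 0 */
    u32 room = Q_FREECNT(rptr, wptr, size_log2);  /* 64 - (70 - 10) == 4   */

Because the expected generation flips on every wrap, a consumer can tell a freshly written entry from a stale one without a separate valid flag; that is the comparison the CQE code later in this header makes against the genbit stored in each entry.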
+#define T3_SQ_CQE_FLIT 13 +#define T3_SQ_COOKIE_FLIT 14 + +#define T3_RQ_COOKIE_FLIT 13 +#define T3_RQ_CQE_FLIT 14 + +static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe) +{ + return G_FW_RIWR_OP(be32toh(wqe->op_seop_flags)); +} + +static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op, + enum t3_wr_flags flags, u8 genbit, u32 tid, + u8 len) +{ + wqe->op_seop_flags = htobe32(V_FW_RIWR_OP(op) | + V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) | + V_FW_RIWR_FLAGS(flags)); + wmb(); + wqe->gen_tid_len = htobe32(V_FW_RIWR_GEN(genbit) | + V_FW_RIWR_TID(tid) | + V_FW_RIWR_LEN(len)); + /* 2nd gen bit... */ + ((union t3_wr *)wqe)->genbit.genbit = htobe64(genbit); +} + +/* + * T3 ULP2_TX commands + */ +enum t3_utx_mem_op { + T3_UTX_MEM_READ = 2, + T3_UTX_MEM_WRITE = 3 +}; + +/* T3 MC7 RDMA TPT entry format */ + +enum tpt_mem_type { + TPT_NON_SHARED_MR = 0x0, + TPT_SHARED_MR = 0x1, + TPT_MW = 0x2, + TPT_MW_RELAXED_PROTECTION = 0x3 +}; + +enum tpt_addr_type { + TPT_ZBTO = 0, + TPT_VATO = 1 +}; + +enum tpt_mem_perm { + TPT_LOCAL_READ = 0x8, + TPT_LOCAL_WRITE = 0x4, + TPT_REMOTE_READ = 0x2, + TPT_REMOTE_WRITE = 0x1 +}; + +struct tpt_entry { + __be32 valid_stag_pdid; + __be32 flags_pagesize_qpid; + + __be32 rsvd_pbl_addr; + __be32 len; + __be32 va_hi; + __be32 va_low_or_fbo; + + __be32 rsvd_bind_cnt_or_pstag; + __be32 rsvd_pbl_size; +}; + +#define S_TPT_VALID 31 +#define V_TPT_VALID(x) ((x) << S_TPT_VALID) +#define F_TPT_VALID V_TPT_VALID(1U) + +#define S_TPT_STAG_KEY 23 +#define M_TPT_STAG_KEY 0xFF +#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY) +#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY) + +#define S_TPT_STAG_STATE 22 +#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE) +#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U) + +#define S_TPT_STAG_TYPE 20 +#define M_TPT_STAG_TYPE 0x3 +#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE) +#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE) + +#define S_TPT_PDID 0 +#define M_TPT_PDID 0xFFFFF +#define V_TPT_PDID(x) ((x) << S_TPT_PDID) +#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID) + +#define S_TPT_PERM 28 +#define M_TPT_PERM 0xF +#define V_TPT_PERM(x) ((x) << S_TPT_PERM) +#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM) + +#define S_TPT_REM_INV_DIS 27 +#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS) +#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U) + +#define S_TPT_ADDR_TYPE 26 +#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE) +#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U) + +#define S_TPT_MW_BIND_ENABLE 25 +#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE) +#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U) + +#define S_TPT_PAGE_SIZE 20 +#define M_TPT_PAGE_SIZE 0x1F +#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE) +#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE) + +#define S_TPT_PBL_ADDR 0 +#define M_TPT_PBL_ADDR 0x1FFFFFFF +#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR) +#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR) + +#define S_TPT_QPID 0 +#define M_TPT_QPID 0xFFFFF +#define V_TPT_QPID(x) ((x) << S_TPT_QPID) +#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID) + +#define S_TPT_PSTAG 0 +#define M_TPT_PSTAG 0xFFFFFF +#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG) +#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG) + +#define S_TPT_PBL_SIZE 0 +#define M_TPT_PBL_SIZE 0xFFFFF +#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE) +#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & 
M_TPT_PBL_SIZE) + +/* + * CQE defs + */ +struct t3_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 wrid_hi; + u32 wrid_low; + } scqe; + } u; +}; + +#define S_CQE_OOO 31 +#define M_CQE_OOO 0x1 +#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO) +#define V_CEQ_OOO(x) ((x)<<S_CQE_OOO) + +#define S_CQE_QPID 12 +#define M_CQE_QPID 0x7FFFF +#define G_CQE_QPID(x) ((((x) >> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<<S_CQE_QPID) + +#define S_CQE_SWCQE 11 +#define M_CQE_SWCQE 0x1 +#define G_CQE_SWCQE(x) ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<<S_CQE_SWCQE) + +#define S_CQE_GENBIT 10 +#define M_CQE_GENBIT 0x1 +#define G_CQE_GENBIT(x) (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<<S_CQE_GENBIT) + +#define S_CQE_STATUS 5 +#define M_CQE_STATUS 0x1F +#define G_CQE_STATUS(x) ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<<S_CQE_STATUS) + +#define S_CQE_TYPE 4 +#define M_CQE_TYPE 0x1 +#define G_CQE_TYPE(x) ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE) +#define V_CQE_TYPE(x) ((x)<<S_CQE_TYPE) + +#define S_CQE_OPCODE 0 +#define M_CQE_OPCODE 0xF +#define G_CQE_OPCODE(x) ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<<S_CQE_OPCODE) + +#define SW_CQE(x) (G_CQE_SWCQE(be32toh((x).header))) +#define CQE_OOO(x) (G_CQE_OOO(be32toh((x).header))) +#define CQE_QPID(x) (G_CQE_QPID(be32toh((x).header))) +#define CQE_GENBIT(x) (G_CQE_GENBIT(be32toh((x).header))) +#define CQE_TYPE(x) (G_CQE_TYPE(be32toh((x).header))) +#define SQ_TYPE(x) (CQE_TYPE((x))) +#define RQ_TYPE(x) (!CQE_TYPE((x))) +#define CQE_STATUS(x) (G_CQE_STATUS(be32toh((x).header))) +#define CQE_OPCODE(x) (G_CQE_OPCODE(be32toh((x).header))) + +#define CQE_LEN(x) (be32toh((x).len)) + +/* used for RQ completion processing */ +#define CQE_WRID_STAG(x) (be32toh((x).u.rcqe.stag)) +#define CQE_WRID_MSN(x) (be32toh((x).u.rcqe.msn)) + +/* used for SQ completion processing */ +#define CQE_WRID_SQ_WPTR(x) ((x).u.scqe.wrid_hi) +#define CQE_WRID_WPTR(x) ((x).u.scqe.wrid_low) + +/* generic accessor macros */ +#define CQE_WRID_HI(x) ((x).u.scqe.wrid_hi) +#define CQE_WRID_LOW(x) ((x).u.scqe.wrid_low) + +#define TPT_ERR_SUCCESS 0x0 +#define TPT_ERR_STAG 0x1 /* STAG invalid: either the */ + /* STAG is offlimt, being 0, */ + /* or STAG_key mismatch */ +#define TPT_ERR_PDID 0x2 /* PDID mismatch */ +#define TPT_ERR_QPID 0x3 /* QPID mismatch */ +#define TPT_ERR_ACCESS 0x4 /* Invalid access right */ +#define TPT_ERR_WRAP 0x5 /* Wrap error */ +#define TPT_ERR_BOUND 0x6 /* base and bounds voilation */ +#define TPT_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */ + /* shared memory region */ +#define TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */ + /* shared memory region */ +#define TPT_ERR_ECC 0x9 /* ECC error detected */ +#define TPT_ERR_ECC_PSTAG 0xA /* ECC error detected when */ + /* reading PSTAG for a MW */ + /* Invalidate */ +#define TPT_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */ + /* software error */ +#define TPT_ERR_SWFLUSH 0xC /* SW FLUSHED */ +#define TPT_ERR_CRC 0x10 /* CRC error */ +#define TPT_ERR_MARKER 0x11 /* Marker error */ +#define TPT_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */ +#define TPT_ERR_OUT_OF_RQE 0x13 /* out of RQE */ +#define TPT_ERR_DDP_VERSION 0x14 /* wrong DDP version */ +#define TPT_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */ +#define TPT_ERR_OPCODE 0x16 /* invalid rdma opcode */ +#define TPT_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp 
queue number */ +#define TPT_ERR_MSN 0x18 /* MSN error */ +#define TPT_ERR_TBIT 0x19 /* tag bit not set correctly */ +#define TPT_ERR_MO 0x1A /* MO not 0 for TERMINATE */ + /* or READ_REQ */ +#define TPT_ERR_MSN_GAP 0x1B +#define TPT_ERR_MSN_RANGE 0x1C +#define TPT_ERR_IRD_OVERFLOW 0x1D +#define TPT_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */ + /* software error */ +#define TPT_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */ + /* mismatch) */ + +struct t3_swsq { + uint64_t wr_id; + struct t3_cqe cqe; + uint32_t sq_wptr; + uint32_t read_len; + int opcode; + int complete; + int signaled; +}; + +/* + * A T3 WQ implements both the SQ and RQ. + */ +struct t3_wq { + union t3_wr *queue; /* DMA accessable memory */ + bus_addr_t dma_addr; /* DMA address for HW */ +#ifdef notyet + DECLARE_PCI_UNMAP_ADDR(mapping) /* unmap kruft */ +#endif + u32 error; /* 1 once we go to ERROR */ + u32 qpid; + u32 wptr; /* idx to next available WR slot */ + u32 size_log2; /* total wq size */ + struct t3_swsq *sq; /* SW SQ */ + struct t3_swsq *oldest_read; /* tracks oldest pending read */ + u32 sq_wptr; /* sq_wptr - sq_rptr == count of */ + u32 sq_rptr; /* pending wrs */ + u32 sq_size_log2; /* sq size */ + u64 *rq; /* SW RQ (holds consumer wr_ids */ + u32 rq_wptr; /* rq_wptr - rq_rptr == count of */ + u32 rq_rptr; /* pending wrs */ + u64 *rq_oldest_wr; /* oldest wr on the SW RQ */ + u32 rq_size_log2; /* rq size */ + u32 rq_addr; /* rq adapter address */ + void /* __iomem */ *doorbell; /* kernel db */ + u64 udb; /* user db if any */ +}; + +struct t3_cq { + u32 cqid; + u32 rptr; + u32 wptr; + u32 size_log2; + bus_addr_t dma_addr; +#ifdef notyet + DECLARE_PCI_UNMAP_ADDR(mapping) +#endif + struct t3_cqe *queue; + struct t3_cqe *sw_queue; + u32 sw_rptr; + u32 sw_wptr; +}; + +#define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \ + CQE_GENBIT(*cqe)) + +static inline void cxio_set_wq_in_error(struct t3_wq *wq) +{ + wq->queue->flit[13] = 1; +} + +static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + return NULL; +} + +static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +#endif diff --git a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h index 6c97a27f674b8..56ccda949beb8 100644 --- a/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h +++ b/sys/dev/cxgb/ulp/toecore/cxgb_toedev.h @@ -31,7 +31,10 @@ $FreeBSD$ ***************************************************************************/ #ifndef _CXGB_TOEDEV_H_ -#define _CXGB_TOEDEV_H_ +#define _CXGB_TOEDEV_H_ +#ifdef notyet +#include <netinet/toedev.h> +#endif /* offload type ids */ enum { diff --git a/sys/dev/cxgb/ulp/toecore/toedev.c b/sys/dev/cxgb/ulp/toecore/toedev.c new file mode 100644 index 0000000000000..07a0d6e94feb9 --- /dev/null +++ b/sys/dev/cxgb/ulp/toecore/toedev.c @@ -0,0 +1,424 @@ + 
+/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <sys/module.h> +#include <sys/queue.h> +#include <sys/mbuf.h> +#include <sys/proc.h> + +#include <sys/socket.h> +#include <sys/sockio.h> + +#include <net/bpf.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <net/route.h> + + +/* + * XXX + */ +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + + + +static struct mtx offload_db_lock; +static TAILQ_HEAD(, toedev) offload_dev_list; +static TAILQ_HEAD(, tom_info) offload_module_list; + +/* + * Returns the entry in the given table with the given offload id, or NULL + * if the id is not found. + */ +static const struct offload_id * +id_find(unsigned int id, const struct offload_id *table) +{ + for ( ; table->id; ++table) + if (table->id == id) + return table; + return NULL; +} + +/* + * Returns true if an offload device is presently attached to an offload module. + */ +static inline int +is_attached(const struct toedev *dev) +{ + return dev->tod_offload_mod != NULL; +} + +/* + * Try to attach a new offload device to an existing TCP offload module that + * can handle the device's offload id. Returns 0 if it succeeds. + * + * Must be called with the offload_db_lock held. + */ +static int +offload_attach(struct toedev *dev) +{ + struct tom_info *t; + + TAILQ_FOREACH(t, &offload_module_list, entry) { + const struct offload_id *entry; + + entry = id_find(dev->tod_ttid, t->ti_id_table); + if (entry && t->ti_attach(dev, entry) == 0) { + dev->tod_offload_mod = t; + return 0; + } + } + return (ENOPROTOOPT); +} + +/** + * register_tom - register a TCP Offload Module (TOM) + * @t: the offload module to register + * + * Register a TCP Offload Module (TOM). 
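A TOM announces the offload ids it can drive through the table that id_find() above walks; a minimal sketch of the registration side, with the field names taken from the code in this file, the attach body reduced to a stub, and struct offload_id assumed to permit designated initialization of its id member:

    /* Illustration only: publish an id table and an attach hook. */
    static int
    my_tom_attach(struct toedev *dev, const struct offload_id *entry)
    {
            /* allocate per-device TOM state here */
            return (0);
    }

    static struct offload_id my_tom_ids[] = {
            { .id = TOE_ID_CHELSIO_T3 },
            { .id = 0 }                   /* id == 0 ends the table */
    };

    static struct tom_info my_tom_info = {
            .ti_attach   = my_tom_attach,
            .ti_id_table = my_tom_ids,
    };

register_tom(&my_tom_info) would then be called from the module's MOD_LOAD handler, and unregister_tom() from MOD_UNLOAD.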
+ */ +int +register_tom(struct tom_info *t) +{ + mtx_lock(&offload_db_lock); + TAILQ_INSERT_HEAD(&offload_module_list, t, entry); + mtx_unlock(&offload_db_lock); + return 0; +} + +/** + * unregister_tom - unregister a TCP Offload Module (TOM) + * @t: the offload module to register + * + * Unregister a TCP Offload Module (TOM). Note that this does not affect any + * TOE devices to which the TOM is already attached. + */ +int +unregister_tom(struct tom_info *t) +{ + mtx_lock(&offload_db_lock); + TAILQ_REMOVE(&offload_module_list, t, entry); + mtx_unlock(&offload_db_lock); + return 0; +} + +/* + * Find an offload device by name. Must be called with offload_db_lock held. + */ +static struct toedev * +__find_offload_dev_by_name(const char *name) +{ + struct toedev *dev; + + TAILQ_FOREACH(dev, &offload_dev_list, entry) { + if (!strncmp(dev->tod_name, name, TOENAMSIZ)) + return dev; + } + return NULL; +} + +/* + * Returns true if an offload device is already registered. + * Must be called with the offload_db_lock held. + */ +static int +is_registered(const struct toedev *dev) +{ + struct toedev *d; + + TAILQ_FOREACH(d, &offload_dev_list, entry) { + if (d == dev) + return 1; + } + return 0; +} + +/* + * Finalize the name of an offload device by assigning values to any format + * strings in its name. + */ +static int +assign_name(struct toedev *dev, const char *name, int limit) +{ + int i; + + for (i = 0; i < limit; ++i) { + char s[TOENAMSIZ]; + + if (snprintf(s, sizeof(s), name, i) >= sizeof(s)) + return -1; /* name too long */ + if (!__find_offload_dev_by_name(s)) { + strcpy(dev->tod_name, s); + return 0; + } + } + return -1; +} + +/** + * register_toedev - register a TOE device + * @dev: the device + * @name: a name template for the device + * + * Register a TOE device and try to attach an appropriate TCP offload module + * to it. @name is a template that may contain at most one %d format + * specifier. + */ +int +register_toedev(struct toedev *dev, const char *name) +{ + int ret; + const char *p; + + /* + * Validate the name template. Only one %d allowed and name must be + * a valid filename so it can appear in sysfs. + */ + if (!name || !*name || !strcmp(name, ".") || !strcmp(name, "..") || + strchr(name, '/')) + return EINVAL; + + p = strchr(name, '%'); + if (p && (p[1] != 'd' || strchr(p + 2, '%'))) + return EINVAL; + + mtx_lock(&offload_db_lock); + if (is_registered(dev)) { /* device already registered */ + ret = EEXIST; + goto out; + } + + if ((ret = assign_name(dev, name, 32)) != 0) + goto out; + + dev->tod_offload_mod = NULL; + TAILQ_INSERT_TAIL(&offload_dev_list, dev, entry); +out: + mtx_unlock(&offload_db_lock); + return ret; +} + +/** + * unregister_toedev - unregister a TOE device + * @dev: the device + * + * Unregister a TOE device. The device must not be attached to an offload + * module. + */ +int +unregister_toedev(struct toedev *dev) +{ + int ret = 0; + + mtx_lock(&offload_db_lock); + if (!is_registered(dev)) { + ret = ENODEV; + goto out; + } + if (is_attached(dev)) { + ret = EBUSY; + goto out; + } + TAILQ_REMOVE(&offload_dev_list, dev, entry); +out: + mtx_unlock(&offload_db_lock); + return ret; +} + +/** + * activate_offload - activate an offload device + * @dev: the device + * + * Activate an offload device by locating an appropriate registered offload + * module. If no module is found the operation fails and may be retried at + * a later time. 
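On the driver side the sequence is the mirror image: register the device under a unit-number template, then ask toecore to bind whatever TOM is loaded. A minimal sketch, assuming tdev points at a struct toedev embedded in the driver's softc with tod_ttid already filled in:

    /* Illustration only: toecore assigns the first free unit of "toe%d". */
    int err;

    if ((err = register_toedev(tdev, "toe%d")) != 0)
            return (err);

    if (activate_offload(tdev) != 0)
            printf("%s: no TOM attached yet\n", tdev->tod_name);

A failure from activate_offload() is not fatal here; as the comment above says, it can simply be retried after a TOM has been loaded.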
+ */ +int +activate_offload(struct toedev *dev) +{ + int ret = 0; + + mtx_lock(&offload_db_lock); + if (!is_registered(dev)) + ret = ENODEV; + else if (!is_attached(dev)) + ret = offload_attach(dev); + mtx_unlock(&offload_db_lock); + return ret; +} + +/** + * toe_send - send a packet to a TOE device + * @dev: the device + * @m: the packet + * + * Sends an mbuf to a TOE driver after dealing with any active network taps. + */ +int +toe_send(struct toedev *dev, struct mbuf *m) +{ + int r; + + critical_enter(); /* XXX neccessary? */ + r = dev->tod_send(dev, m); + critical_exit(); + if (r) + BPF_MTAP(dev->tod_lldev, m); + return r; +} + +/** + * toe_receive_mbuf - process n received TOE packets + * @dev: the toe device + * @m: an array of offload packets + * @n: the number of offload packets + * + * Process an array of ingress offload packets. Each packet is forwarded + * to any active network taps and then passed to the toe device's receive + * method. We optimize passing packets to the receive method by passing + * it the whole array at once except when there are active taps. + */ +int +toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n) +{ + if (__predict_true(!bpf_peers_present(dev->tod_lldev->if_bpf))) + return dev->tod_recv(dev, m, n); + + for ( ; n; n--, m++) { + m[0]->m_pkthdr.rcvif = dev->tod_lldev; + BPF_MTAP(dev->tod_lldev, m[0]); + dev->tod_recv(dev, m, 1); + } + return 0; +} + +static inline int +ifnet_is_offload(const struct ifnet *ifp) +{ + return (ifp->if_flags & IFCAP_TOE); +} + +void +toe_arp_update(struct rtentry *rt) +{ + struct ifnet *ifp = rt->rt_ifp; + + if (ifp && ifnet_is_offload(ifp)) { + struct toedev *tdev = TOEDEV(ifp); + + if (tdev && tdev->tod_arp_update) + tdev->tod_arp_update(tdev, rt); + } +} + +/** + * offload_get_phys_egress - find the physical egress device + * @root_dev: the root device anchoring the search + * @so: the socket used to determine egress port in bonding mode + * @context: in bonding mode, indicates a connection set up or failover + * + * Given a root network device it returns the physical egress device that is a + * descendant of the root device. The root device may be either a physical + * device, in which case it is the device returned, or a virtual device, such + * as a VLAN or bonding device. In case of a bonding device the search + * considers the decisions of the bonding device given its mode to locate the + * correct egress device. 
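For the receive side shown above, the interesting caller is a driver's response-queue handler: it can collect a run of offload packets and pass the whole array to toe_receive_mbuf() in one call, and the slower per-packet BPF path is only taken when taps are active. A rough sketch, in which the batch size and the next_offload_packet() helper are purely illustrative stand-ins for the driver's own receive loop:

    /* Illustration only: hand a batch of offload packets to the TOE. */
    struct mbuf *batch[8];              /* size picked arbitrarily */
    int n = 0;

    /* next_offload_packet() is hypothetical, standing in for the rx loop */
    while (n < 8 && (batch[n] = next_offload_packet()) != NULL)
            n++;

    if (n > 0)
            toe_receive_mbuf(tdev, batch, n);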
+ */ +struct ifnet * +offload_get_phys_egress(struct ifnet *root_dev, struct socket *so, int context) +{ + +#if 0 + while (root_dev && ifnet_is_offload(root_dev)) { + if (root_dev->tod_priv_flags & IFF_802_1Q_VLAN) + root_dev = VLAN_DEV_INFO(root_dev)->real_dev; + else if (root_dev->tod_flags & IFF_MASTER) + root_dev = toe_bond_get_slave(root_dev, sk, context); + else + break; + } +#endif + return root_dev; +} + +static int +toecore_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + mtx_init(&offload_db_lock, "toedev lock", NULL, MTX_DEF); + TAILQ_INIT(&offload_dev_list); + TAILQ_INIT(&offload_module_list); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + mtx_lock(&offload_db_lock); + if (!TAILQ_EMPTY(&offload_dev_list) || + !TAILQ_EMPTY(&offload_module_list)) { + err = EBUSY; + mtx_unlock(&offload_db_lock); + break; + } + mtx_unlock(&offload_db_lock); + mtx_destroy(&offload_db_lock); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + + +static moduledata_t mod_data= { + "toecore", + toecore_load, + 0 +}; + +MODULE_VERSION(toecore, 1); +DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c new file mode 100644 index 0000000000000..00b45750e752b --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -0,0 +1,4456 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/ktr.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/protosw.h> +#include <sys/priv.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/ip.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_syncache.h> +#include <netinet/tcp_timer.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> +#include <vm/vm.h> +#include <vm/pmap.h> +#include <machine/bus.h> +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> + +/* + * For ULP connections HW may add headers, e.g., for digests, that aren't part + * of the messages sent by the host but that are part of the TCP payload and + * therefore consume TCP sequence space. Tx connection parameters that + * operate in TCP sequence space are affected by the HW additions and need to + * compensate for them to accurately track TCP sequence numbers. This array + * contains the compensating extra lengths for ULP packets. It is indexed by + * a packet's ULP submode. + */ +const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; + +#ifdef notyet +/* + * This sk_buff holds a fake header-only TCP segment that we use whenever we + * need to exploit SW TCP functionality that expects TCP headers, such as + * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple + * CPUs without locking. + */ +static struct mbuf *tcphdr_mbuf __read_mostly; +#endif + +/* + * Size of WRs in bytes. Note that we assume all devices we are handling have + * the same WR size. + */ +static unsigned int wrlen __read_mostly; + +/* + * The number of WRs needed for an skb depends on the number of page fragments + * in the skb and whether it has any payload in its main body. This maps the + * length of the gather list represented by an skb into the # of necessary WRs. + */ +static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; + +/* + * Max receive window supported by HW in bytes. Only a small part of it can + * be set through option0, the rest needs to be set through RX_DATA_ACK. + */ +#define MAX_RCV_WND ((1U << 27) - 1) + +/* + * Min receive window. We want it to be large enough to accommodate receive + * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
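Returning to the t3_ulp_extra_len[] table above: it is indexed by a frame's ULP submode, and any transmit-side sequence accounting has to include these hardware-inserted bytes even though the host never queued them. A minimal sketch, where plen and ulp_submode stand in for values the caller already has:

    /* Illustration only: TCP sequence space consumed by one ULP frame,
     * including the bytes the HW adds (the table has four entries). */
    unsigned int seq_bytes = plen + t3_ulp_extra_len[ulp_submode & 3];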
+ */ +#define MIN_RCV_WND (24 * 1024U) +#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) + +#define VALIDATE_SEQ 0 +#define VALIDATE_SOCK(so) +#define DEBUG_WR 0 + +#define TCP_TIMEWAIT 1 +#define TCP_CLOSE 2 +#define TCP_DROP 3 + +extern int tcp_do_autorcvbuf; +extern int tcp_do_autosndbuf; +extern int tcp_autorcvbuf_max; +extern int tcp_autosndbuf_max; + +static void t3_send_reset(struct toepcb *toep); +static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); +static inline void free_atid(struct t3cdev *cdev, unsigned int tid); +static void handle_syncache_event(int event, void *arg); + +static inline void +SBAPPEND(struct sockbuf *sb, struct mbuf *n) +{ + struct mbuf *m; + + m = sb->sb_mb; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + m = n; + while (m) { + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || + !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", + !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } + KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); + sbappendstream_locked(sb, n); + m = sb->sb_mb; + + while (m) { + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", + m->m_next, m->m_nextpkt, m->m_flags)); + m = m->m_next; + } +} + +static inline int +is_t3a(const struct toedev *dev) +{ + return (dev->tod_ttid == TOE_ID_CHELSIO_T3); +} + +static void +dump_toepcb(struct toepcb *toep) +{ + DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", + toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, + toep->tp_mtu_idx, toep->tp_tid); + + DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", + toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, + toep->tp_mss_clamp, toep->tp_flags); +} + +#ifndef RTALLOC2_DEFINED +static struct rtentry * +rtalloc2(struct sockaddr *dst, int report, u_long ignflags) +{ + struct rtentry *rt = NULL; + + if ((rt = rtalloc1(dst, report, ignflags)) != NULL) + RT_UNLOCK(rt); + + return (rt); +} +#endif + +/* + * Determine whether to send a CPL message now or defer it. A message is + * deferred if the connection is in SYN_SENT since we don't know the TID yet. + * For connections in other states the message is sent immediately. + * If through_l2t is set the message is subject to ARP processing, otherwise + * it is sent directly. + */ +static inline void +send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) +{ + struct tcpcb *tp = toep->tp_tp; + + if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { + inp_wlock(tp->t_inpcb); + mbufq_tail(&toep->out_of_order_queue, m); // defer + inp_wunlock(tp->t_inpcb); + } else if (through_l2t) + l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T + else + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly +} + +static inline unsigned int +mkprio(unsigned int cntrl, const struct toepcb *toep) +{ + return (cntrl); +} + +/* + * Populate a TID_RELEASE WR. The skb must be already propely sized. 
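Callers of the helper defined just below allocate the mbuf themselves and then hand the finished CPL to the adapter. A minimal sketch of releasing a TID outside the SYN_SENT deferral case, with allocation failure reduced to a NULL check:

    /* Illustration only: build a TID_RELEASE and send it directly;
     * no L2T/ARP processing is needed for this message. */
    struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

    if (m != NULL) {
            mk_tid_release(m, toep, toep->tp_tid);
            cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
    }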
+ */ +static inline void +mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) +{ + struct cpl_tid_release *req; + + m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req = mtod(m, struct cpl_tid_release *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +static inline void +make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct tx_data_wr *req; + struct sockbuf *snd; + + inp_lock_assert(tp->t_inpcb); + snd = so_sockbuf_snd(so); + + req = mtod(m, struct tx_data_wr *); + m->m_len = sizeof(*req); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); + /* len includes the length of any HW ULP additions */ + req->len = htonl(len); + req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); + /* V_TX_ULP_SUBMODE sets both the mode and submode */ + req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | + V_TX_URG(/* skb_urgent(skb) */ 0 ) | + V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && + (tail ? 0 : 1)))); + req->sndseq = htonl(tp->snd_nxt); + if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { + req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | + V_TX_CPU_IDX(toep->tp_qset)); + + /* Sendbuffer is in units of 32KB. + */ + if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) + req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15)); + else { + req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); + } + + toep->tp_flags |= TP_DATASENT; + } +} + +#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ + +int +t3_push_frames(struct socket *so, int req_completion) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + struct mbuf *tail, *m0, *last; + struct t3cdev *cdev; + struct tom_data *d; + int state, bytes, count, total_bytes; + bus_dma_segment_t segs[TX_MAX_SEGS], *segp; + struct sockbuf *snd; + + if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { + DPRINTF("tcp state=%d\n", tp->t_state); + return (0); + } + + state = so_state_get(so); + + if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { + DPRINTF("disconnecting\n"); + + return (0); + } + + inp_lock_assert(tp->t_inpcb); + + snd = so_sockbuf_snd(so); + sockbuf_lock(snd); + + d = TOM_DATA(toep->tp_toedev); + cdev = d->cdev; + + last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; + + total_bytes = 0; + DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", + toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last); + + if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) { + KASSERT(tail, ("sbdrop error")); + last = tail = tail->m_next; + } + + if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { + DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); + sockbuf_unlock(snd); + + return (0); + } + + toep->tp_m_last = NULL; + while (toep->tp_wr_avail && (tail != NULL)) { + count = bytes = 0; + segp = segs; + if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { + sockbuf_unlock(snd); + return (0); + } + /* + * If the data in tail fits as in-line, then + * make an immediate data wr. 
+ */ + if (tail->m_len <= IMM_LEN) { + count = 1; + bytes = tail->m_len; + last = tail; + tail = tail->m_next; + m_set_sgl(m0, NULL); + m_set_sgllen(m0, 0); + make_tx_data_wr(so, m0, bytes, tail); + m_append(m0, bytes, mtod(last, caddr_t)); + KASSERT(!m0->m_next, ("bad append")); + } else { + while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) + && (tail != NULL) && (count < TX_MAX_SEGS-1)) { + bytes += tail->m_len; + last = tail; + count++; + /* + * technically an abuse to be using this for a VA + * but less gross than defining my own structure + * or calling pmap_kextract from here :-| + */ + segp->ds_addr = (bus_addr_t)tail->m_data; + segp->ds_len = tail->m_len; + DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", + count, mbuf_wrs[count], tail->m_data, tail->m_len); + segp++; + tail = tail->m_next; + } + DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", + toep->tp_wr_avail, count, mbuf_wrs[count], tail); + + m_set_sgl(m0, segs); + m_set_sgllen(m0, count); + make_tx_data_wr(so, m0, bytes, tail); + } + m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); + + if (tail) { + snd->sb_sndptr = tail; + toep->tp_m_last = NULL; + } else + toep->tp_m_last = snd->sb_sndptr = last; + + + DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); + + snd->sb_sndptroff += bytes; + total_bytes += bytes; + toep->tp_write_seq += bytes; + CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d" + " tail=%p sndptr=%p sndptroff=%d", + toep->tp_wr_avail, count, mbuf_wrs[count], + tail, snd->sb_sndptr, snd->sb_sndptroff); + if (tail) + CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d" + " tp_m_last=%p tailbuf=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tail->m_data, + tp->snd_una); + else + CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d" + " tp_m_last=%p snd_una=0x%08x", + total_bytes, toep->tp_m_last, tp->snd_una); + + +#ifdef KTR +{ + int i; + + i = 0; + while (i < count && m_get_sgllen(m0)) { + if ((count - i) >= 3) { + CTR6(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" + " len=%d pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len, + segs[i + 1].ds_addr, segs[i + 1].ds_len, + segs[i + 2].ds_addr, segs[i + 2].ds_len); + i += 3; + } else if ((count - i) == 2) { + CTR4(KTR_TOM, + "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" + " len=%d", + segs[i].ds_addr, segs[i].ds_len, + segs[i + 1].ds_addr, segs[i + 1].ds_len); + i += 2; + } else { + CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", + segs[i].ds_addr, segs[i].ds_len); + i++; + } + + } +} +#endif + /* + * remember credits used + */ + m0->m_pkthdr.csum_data = mbuf_wrs[count]; + m0->m_pkthdr.len = bytes; + toep->tp_wr_avail -= mbuf_wrs[count]; + toep->tp_wr_unacked += mbuf_wrs[count]; + + if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || + toep->tp_wr_unacked >= toep->tp_wr_max / 2) { + struct work_request_hdr *wr = cplhdr(m0); + + wr->wr_hi |= htonl(F_WR_COMPL); + toep->tp_wr_unacked = 0; + } + KASSERT((m0->m_pkthdr.csum_data > 0) && + (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", + m0->m_pkthdr.csum_data)); + m0->m_type = MT_DONTFREE; + enqueue_wr(toep, m0); + DPRINTF("sending offload tx with %d bytes in %d segments\n", + bytes, count); + l2t_send(cdev, m0, toep->tp_l2t); + } + sockbuf_unlock(snd); + return (total_bytes); +} + +/* + * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail + * under any circumstances. We take the easy way out and always queue the + * message to the write_queue. We can optimize the case where the queue is + * already empty though the optimization is probably not worth it. 
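Before the connection-close path below, two details of the transmit loop above are worth calling out: the WR credits a frame consumes ride along in m_pkthdr.csum_data, and a completion is requested once the credits consumed since the last request reach half of tp_wr_max. Worked through with tp_wr_max == 16: when tp_wr_unacked reaches 8, the WR that crossed the threshold gets F_WR_COMPL set and tp_wr_unacked restarts from 0. The completion handler is not part of this excerpt, so the line below is only an assumption about its effect:

    /* Assumed (handler not shown here): credits stashed at transmit
     * time are simply given back when the WR completion arrives. */
    toep->tp_wr_avail += m->m_pkthdr.csum_data;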
+ */ +static void +close_conn(struct socket *so) +{ + struct mbuf *m; + struct cpl_close_con_req *req; + struct tom_data *d; + struct inpcb *inp = so_sotoinpcb(so); + struct tcpcb *tp; + struct toepcb *toep; + unsigned int tid; + + + inp_wlock(inp); + tp = so_sototcpcb(so); + toep = tp->t_toe; + + if (tp->t_state != TCPS_SYN_SENT) + t3_push_frames(so, 1); + + if (toep->tp_flags & TP_FIN_SENT) { + inp_wunlock(inp); + return; + } + + tid = toep->tp_tid; + + d = TOM_DATA(toep->tp_toedev); + + m = m_gethdr_nofail(sizeof(*req)); + m_set_priority(m, CPL_PRIORITY_DATA); + m_set_sgl(m, NULL); + m_set_sgllen(m, 0); + + toep->tp_flags |= TP_FIN_SENT; + req = mtod(m, struct cpl_close_con_req *); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + req->rsvd = 0; + inp_wunlock(inp); + /* + * XXX - need to defer shutdown while there is still data in the queue + * + */ + CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid); + cxgb_ofld_send(d->cdev, m); + +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. + */ +static void +abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) +{ + struct cpl_abort_req *req = cplhdr(m); + + req->cmd = CPL_ABORT_NO_RST; + cxgb_ofld_send(cdev, m); +} + +/* + * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are + * permitted to return without sending the message in case we cannot allocate + * an sk_buff. Returns the number of credits sent. + */ +uint32_t +t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = toep->tp_toedev; + + m = m_gethdr_nofail(sizeof(*req)); + + DPRINTF("returning %u credits to HW\n", credits); + + req = mtod(m, struct cpl_rx_data_ack *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); + m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + return (credits); +} + +/* + * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. + * This is only used in DDP mode, so we take the opportunity to also set the + * DACK mode and flush any Rx credits. + */ +void +t3_send_rx_modulate(struct toepcb *toep) +{ + struct mbuf *m; + struct cpl_rx_data_ack *req; + + m = m_gethdr_nofail(sizeof(*req)); + + req = mtod(m, struct cpl_rx_data_ack *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + m->m_pkthdr.len = m->m_len = sizeof(*req); + + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); + req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(1) | + V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); + toep->tp_rcv_wup = toep->tp_copied_seq; +} + +/* + * Handle receipt of an urgent pointer. 
+ */ +static void +handle_urg_ptr(struct socket *so, uint32_t urg_seq) +{ +#ifdef URGENT_DATA_SUPPORTED + struct tcpcb *tp = so_sototcpcb(so); + + urg_seq--; /* initially points past the urgent data, per BSD */ + + if (tp->urg_data && !after(urg_seq, tp->urg_seq)) + return; /* duplicate pointer */ + sk_send_sigurg(sk); + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + tp->copied_seq++; + if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) + tom_eat_skb(sk, skb, 0); + } + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = urg_seq; +#endif +} + +/* + * Returns true if a socket cannot accept new Rx data. + */ +static inline int +so_no_receive(const struct socket *so) +{ + return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); +} + +/* + * Process an urgent data notification. + */ +static void +rx_urg_notify(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_rx_urg_notify *hdr = cplhdr(m); + struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); + + VALIDATE_SOCK(so); + + if (!so_no_receive(so)) + handle_urg_ptr(so, ntohl(hdr->seq)); + + m_freem(m); +} + +/* + * Handler for RX_URG_NOTIFY CPL messages. + */ +static int +do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + rx_urg_notify(toep, m); + return (0); +} + +static __inline int +is_delack_mode_valid(struct toedev *dev, struct toepcb *toep) +{ + return (toep->tp_ulp_mode || + (toep->tp_ulp_mode == ULP_MODE_TCPDDP && + dev->tod_ttid >= TOE_ID_CHELSIO_T3)); +} + +/* + * Set of states for which we should return RX credits. + */ +#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) + +/* + * Called after some received data has been read. It returns RX credits + * to the HW for the amount of data processed. 
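The expected caller is the socket receive path once data has left the receive buffer; cxgb_toe_rcvd() further down passes copied == 0 and lets the function derive the count from the sockbuf itself. A minimal sketch of the explicit form, assuming the caller knows how many bytes it just handed to userland:

    /* Illustration only: the inpcb lock must be held, as the
     * inp_lock_assert() inside the function checks. */
    inp_wlock(tp->t_inpcb);
    t3_cleanup_rbuf(tp, copied);
    inp_wunlock(tp->t_inpcb);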
+ */ +void +t3_cleanup_rbuf(struct tcpcb *tp, int copied) +{ + struct toepcb *toep = tp->t_toe; + struct socket *so; + struct toedev *dev; + int dack_mode, must_send, read; + u32 thres, credits, dack = 0; + struct sockbuf *rcv; + + so = inp_inpcbtosocket(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + + if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || + (tp->t_state == TCPS_FIN_WAIT_2))) { + if (copied) { + sockbuf_lock(rcv); + toep->tp_copied_seq += copied; + sockbuf_unlock(rcv); + } + + return; + } + + inp_lock_assert(tp->t_inpcb); + + sockbuf_lock(rcv); + if (copied) + toep->tp_copied_seq += copied; + else { + read = toep->tp_enqueued_bytes - rcv->sb_cc; + toep->tp_copied_seq += read; + } + credits = toep->tp_copied_seq - toep->tp_rcv_wup; + toep->tp_enqueued_bytes = rcv->sb_cc; + sockbuf_unlock(rcv); + + if (credits > rcv->sb_mbmax) { + log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", + toep->tp_copied_seq, toep->tp_rcv_wup, credits); + credits = rcv->sb_mbmax; + } + + + /* + * XXX this won't accurately reflect credit return - we need + * to look at the difference between the amount that has been + * put in the recv sockbuf and what is there now + */ + + if (__predict_false(!credits)) + return; + + dev = toep->tp_toedev; + thres = TOM_TUNABLE(dev, rx_credit_thres); + + if (__predict_false(thres == 0)) + return; + + if (is_delack_mode_valid(dev, toep)) { + dack_mode = TOM_TUNABLE(dev, delack); + if (__predict_false(dack_mode != toep->tp_delack_mode)) { + u32 r = tp->rcv_nxt - toep->tp_delack_seq; + + if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) + dack = F_RX_DACK_CHANGE | + V_RX_DACK_MODE(dack_mode); + } + } else + dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + + /* + * For coalescing to work effectively ensure the receive window has + * at least 16KB left. + */ + must_send = credits + 16384 >= tp->rcv_wnd; + + if (must_send || credits >= thres) + toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); +} + +static int +cxgb_toe_disconnect(struct tcpcb *tp) +{ + struct socket *so; + + DPRINTF("cxgb_toe_disconnect\n"); + + so = inp_inpcbtosocket(tp->t_inpcb); + close_conn(so); + return (0); +} + +static int +cxgb_toe_reset(struct tcpcb *tp) +{ + struct toepcb *toep = tp->t_toe; + + t3_send_reset(toep); + + /* + * unhook from socket + */ + tp->t_flags &= ~TF_TOE; + toep->tp_tp = NULL; + tp->t_toe = NULL; + return (0); +} + +static int +cxgb_toe_send(struct tcpcb *tp) +{ + struct socket *so; + + DPRINTF("cxgb_toe_send\n"); + dump_toepcb(tp->t_toe); + + so = inp_inpcbtosocket(tp->t_inpcb); + t3_push_frames(so, 1); + return (0); +} + +static int +cxgb_toe_rcvd(struct tcpcb *tp) +{ + + inp_lock_assert(tp->t_inpcb); + + t3_cleanup_rbuf(tp, 0); + + return (0); +} + +static void +cxgb_toe_detach(struct tcpcb *tp) +{ + struct toepcb *toep; + + /* + * XXX how do we handle teardown in the SYN_SENT state? 
+ * + */ + inp_lock_assert(tp->t_inpcb); + toep = tp->t_toe; + toep->tp_tp = NULL; + + /* + * unhook from socket + */ + tp->t_flags &= ~TF_TOE; + tp->t_toe = NULL; +} + + +static struct toe_usrreqs cxgb_toe_usrreqs = { + .tu_disconnect = cxgb_toe_disconnect, + .tu_reset = cxgb_toe_reset, + .tu_send = cxgb_toe_send, + .tu_rcvd = cxgb_toe_rcvd, + .tu_detach = cxgb_toe_detach, + .tu_detach = cxgb_toe_detach, + .tu_syncache_event = handle_syncache_event, +}; + + +static void +__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, + uint64_t mask, uint64_t val, int no_reply) +{ + struct cpl_set_tcb_field *req; + + CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + toep->tp_tid, word, mask, val); + + req = mtod(m, struct cpl_set_tcb_field *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); + req->reply = V_NO_REPLY(no_reply); + req->cpu_idx = 0; + req->word = htons(word); + req->mask = htobe64(mask); + req->val = htobe64(val); + + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + send_or_defer(toep, m, 0); +} + +static void +t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) +{ + struct mbuf *m; + struct tcpcb *tp = toep->tp_tp; + + if (toep == NULL) + return; + + if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { + printf("not seting field\n"); + return; + } + + m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); + + __set_tcb_field(toep, m, word, mask, val, 1); +} + +/* + * Set one of the t_flags bits in the TCB. + */ +static void +set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) +{ + + t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. + */ +static void +t3_set_nagle(struct toepcb *toep) +{ + struct tcpcb *tp = toep->tp_tp; + + set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. + */ +void +t3_set_keepalive(struct toepcb *toep, int on_off) +{ + + set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); +} + +void +t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) +{ + set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); +} + +void +t3_set_dack_mss(struct toepcb *toep, int on_off) +{ + + set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); +} + +/* + * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. + */ +static void +t3_set_tos(struct toepcb *toep) +{ + int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); + + t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), + V_TCB_TOS(tos)); +} + + +/* + * In DDP mode, TP fails to schedule a timer to push RX data to the host when + * DDP is disabled (data is delivered to freelist). [Note that, the peer should + * set the PSH bit in the last segment, which would trigger delivery.] + * We work around the issue by setting a DDP buffer in a partial placed state, + * which guarantees that TP will schedule a timer. 
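Stepping back to the helpers above for a moment: all of the t3_set_* wrappers funnel into t3_set_tcb_field(), which updates the TCB at a given word offset under a 64-bit mask. The workaround defined just below is the most involved use, combining flag bits with an adjacent field shifted up by 32; the common case is a single flag, as in this sketch of what t3_set_keepalive(toep, 1) amounts to:

    /* Illustration only: turn on one bit in the TCB t_flags word. */
    t3_set_tcb_field(toep, W_TCB_T_FLAGS1,
        1ULL << S_TF_KEEPALIVE,     /* mask: only this flag */
        1ULL << S_TF_KEEPALIVE);    /* value: set it        */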
+ */ +#define TP_DDP_TIMER_WORKAROUND_MASK\ + (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ + ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ + V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) +#define TP_DDP_TIMER_WORKAROUND_VAL\ + (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ + ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\ + 32)) + +static void +t3_enable_ddp(struct toepcb *toep, int on) +{ + if (on) { + + t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), + V_TF_DDP_OFF(0)); + } else + t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_MASK, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_VAL); + +} + +void +t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color) +{ + t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), + tag_color); +} + +void +t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, + unsigned int len) +{ + if (buf_idx == 0) + t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) | + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); + else + t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), + V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) | + V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32)); +} + +static int +t3_set_cong_control(struct socket *so, const char *name) +{ +#ifdef CONGESTION_CONTROL_SUPPORTED + int cong_algo; + + for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) + if (!strcmp(name, t3_cong_ops[cong_algo].name)) + break; + + if (cong_algo >= ARRAY_SIZE(t3_cong_ops)) + return -EINVAL; +#endif + return 0; +} + +int +t3_get_tcb(struct toepcb *toep) +{ + struct cpl_get_tcb *req; + struct tcpcb *tp = toep->tp_tp; + struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); + + if (!m) + return (ENOMEM); + + inp_lock_assert(tp->t_inpcb); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + req = mtod(m, struct cpl_get_tcb *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); + req->cpuno = htons(toep->tp_qset); + req->rsvd = 0; + if (tp->t_state == TCPS_SYN_SENT) + mbufq_tail(&toep->out_of_order_queue, m); // defer + else + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); + return 0; +} + +static inline void +so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid) +{ + + toepcb_hold(toep); + + cxgb_insert_tid(d->cdev, d->client, toep, tid); +} + +/** + * find_best_mtu - find the entry in the MTU table closest to an MTU + * @d: TOM state + * @mtu: the target MTU + * + * Returns the index of the value in the MTU table that is closest to but + * does not exceed the target MTU. 
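A small worked example, using a made-up table since the real MTU table is programmed elsewhere: with d->mtus[] = { 1024, 1280, 1460, 1500, 9000 } and nmtus == 5, a target of 1400 walks past 1024 and 1280, stops because 1460 would exceed it, and returns index 1. select_mss() below then subtracts the 40 bytes of IP and TCP header from the chosen entry to arrive at t_maxseg.

    /* Illustration only, with the made-up table above: */
    unsigned int idx = find_best_mtu(d, 1400);   /* == 1, d->mtus[1] == 1280 */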
+ */ +static unsigned int +find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return (i); +} + +static unsigned int +select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) +{ + unsigned int idx; + +#ifdef notyet + struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt; +#endif + if (tp) { + tp->t_maxseg = pmtu - 40; + if (tp->t_maxseg < td->mtus[0] - 40) + tp->t_maxseg = td->mtus[0] - 40; + idx = find_best_mtu(td, tp->t_maxseg + 40); + + tp->t_maxseg = td->mtus[idx] - 40; + } else + idx = find_best_mtu(td, pmtu); + + return (idx); +} + +static inline void +free_atid(struct t3cdev *cdev, unsigned int tid) +{ + struct toepcb *toep = cxgb_free_atid(cdev, tid); + + if (toep) + toepcb_release(toep); +} + +/* + * Release resources held by an offload connection (TID, L2T entry, etc.) + */ +static void +t3_release_offload_resources(struct toepcb *toep) +{ + struct tcpcb *tp = toep->tp_tp; + struct toedev *tdev = toep->tp_toedev; + struct t3cdev *cdev; + struct socket *so; + unsigned int tid = toep->tp_tid; + struct sockbuf *rcv; + + CTR0(KTR_TOM, "t3_release_offload_resources"); + + if (!tdev) + return; + + cdev = TOEP_T3C_DEV(toep); + if (!cdev) + return; + + toep->tp_qset = 0; + t3_release_ddp_resources(toep); + +#ifdef CTRL_SKB_CACHE + kfree_skb(CTRL_SKB_CACHE(tp)); + CTRL_SKB_CACHE(tp) = NULL; +#endif + + if (toep->tp_wr_avail != toep->tp_wr_max) { + purge_wr_queue(toep); + reset_wr_list(toep); + } + + if (toep->tp_l2t) { + l2t_release(L2DATA(cdev), toep->tp_l2t); + toep->tp_l2t = NULL; + } + toep->tp_tp = NULL; + if (tp) { + inp_lock_assert(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + /* + * cancel any offloaded reads + * + */ + sockbuf_lock(rcv); + tp->t_toe = NULL; + tp->t_flags &= ~TF_TOE; + if (toep->tp_ddp_state.user_ddp_pending) { + t3_cancel_ubuf(toep, rcv); + toep->tp_ddp_state.user_ddp_pending = 0; + } + so_sorwakeup_locked(so); + + } + + if (toep->tp_state == TCPS_SYN_SENT) { + free_atid(cdev, tid); +#ifdef notyet + __skb_queue_purge(&tp->out_of_order_queue); +#endif + } else { // we have TID + cxgb_remove_tid(cdev, toep, tid); + toepcb_release(toep); + } +#if 0 + log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state); +#endif +} + +static void +install_offload_ops(struct socket *so) +{ + struct tcpcb *tp = so_sototcpcb(so); + + KASSERT(tp->t_toe != NULL, ("toepcb not set")); + + t3_install_socket_ops(so); + tp->t_flags |= TF_TOE; + tp->t_tu = &cxgb_toe_usrreqs; +} + +/* + * Determine the receive window scaling factor given a target max + * receive window. + */ +static __inline int +select_rcv_wscale(int space) +{ + int wscale = 0; + + if (space > MAX_RCV_WND) + space = MAX_RCV_WND; + + if (tcp_do_rfc1323) + for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; + + return (wscale); +} + +/* + * Determine the receive window size for a socket. + */ +static unsigned long +select_rcv_wnd(struct toedev *dev, struct socket *so) +{ + struct tom_data *d = TOM_DATA(dev); + unsigned int wnd; + unsigned int max_rcv_wnd; + struct sockbuf *rcv; + + rcv = so_sockbuf_rcv(so); + + if (tcp_do_autorcvbuf) + wnd = tcp_autorcvbuf_max; + else + wnd = rcv->sb_hiwat; + + + + /* XXX + * For receive coalescing to work effectively we need a receive window + * that can accomodate a coalesced segment. + */ + if (wnd < MIN_RCV_WND) + wnd = MIN_RCV_WND; + + /* PR 5138 */ + max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 
+ (uint32_t)d->rx_page_size * 23 : + MAX_RCV_WND); + + return min(wnd, max_rcv_wnd); +} + +/* + * Assign offload parameters to some socket fields. This code is used by + * both active and passive opens. + */ +static inline void +init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, + struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev); + struct sockbuf *snd, *rcv; + +#ifdef notyet + SOCK_LOCK_ASSERT(so); +#endif + + snd = so_sockbuf_snd(so); + rcv = so_sockbuf_rcv(so); + + log(LOG_INFO, "initializing offload socket\n"); + /* + * We either need to fix push frames to work with sbcompress + * or we need to add this + */ + snd->sb_flags |= SB_NOCOALESCE; + rcv->sb_flags |= SB_NOCOALESCE; + + tp->t_toe = toep; + toep->tp_tp = tp; + toep->tp_toedev = dev; + + toep->tp_tid = tid; + toep->tp_l2t = e; + toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs); + toep->tp_wr_unacked = 0; + toep->tp_delack_mode = 0; + + toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu); + /* + * XXX broken + * + */ + tp->rcv_wnd = select_rcv_wnd(dev, so); + + toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0; + toep->tp_qset_idx = 0; + + reset_wr_list(toep); + DPRINTF("initialization done\n"); +} + +/* + * The next two functions calculate the option 0 value for a socket. + */ +static inline unsigned int +calc_opt0h(struct socket *so, int mtu_idx) +{ + struct tcpcb *tp = so_sototcpcb(so); + int wscale = select_rcv_wscale(tp->rcv_wnd); + + return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) | + V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS | + V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx); +} + +static inline unsigned int +calc_opt0l(struct socket *so, int ulp_mode) +{ + struct tcpcb *tp = so_sototcpcb(so); + unsigned int val; + + val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) | + V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ)); + + DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val); + return (val); +} + +static inline unsigned int +calc_opt2(const struct socket *so, struct toedev *dev) +{ + int flv_valid; + + flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); + + return (V_FLAVORS_VALID(flv_valid) | + V_CONG_CONTROL_FLAVOR(flv_valid ? 
TOM_TUNABLE(dev, cong_alg) : 0)); +} + +#if DEBUG_WR > 1 +static int +count_pending_wrs(const struct toepcb *toep) +{ + const struct mbuf *m; + int n = 0; + + wr_queue_walk(toep, m) + n += m->m_pkthdr.csum_data; + return (n); +} +#endif + +#if 0 +(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) +#endif + +static void +mk_act_open_req(struct socket *so, struct mbuf *m, + unsigned int atid, const struct l2t_entry *e) +{ + struct cpl_act_open_req *req; + struct inpcb *inp = so_sotoinpcb(so); + struct tcpcb *tp = inp_inpcbtotcpcb(inp); + struct toepcb *toep = tp->t_toe; + struct toedev *tdev = toep->tp_toedev; + + m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep)); + + req = mtod(m, struct cpl_act_open_req *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = 0; + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); + inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port); +#if 0 + req->local_port = inp->inp_lport; + req->peer_port = inp->inp_fport; + memcpy(&req->local_ip, &inp->inp_laddr, 4); + memcpy(&req->peer_ip, &inp->inp_faddr, 4); +#endif + req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | + V_TX_CHANNEL(e->smt_idx)); + req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); + req->params = 0; + req->opt2 = htonl(calc_opt2(so, tdev)); +} + + +/* + * Convert an ACT_OPEN_RPL status to an errno. + */ +static int +act_open_rpl_status_to_errno(int status) +{ + switch (status) { + case CPL_ERR_CONN_RESET: + return (ECONNREFUSED); + case CPL_ERR_ARP_MISS: + return (EHOSTUNREACH); + case CPL_ERR_CONN_TIMEDOUT: + return (ETIMEDOUT); + case CPL_ERR_TCAM_FULL: + return (ENOMEM); + case CPL_ERR_CONN_EXIST: + log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n"); + return (EADDRINUSE); + default: + return (EIO); + } +} + +static void +fail_act_open(struct toepcb *toep, int errno) +{ + struct tcpcb *tp = toep->tp_tp; + + t3_release_offload_resources(toep); + if (tp) { + inp_wunlock(tp->t_inpcb); + tcp_offload_drop(tp, errno); + } + +#ifdef notyet + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); +#endif +} + +/* + * Handle active open failures. + */ +static void +active_open_failed(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_act_open_rpl *rpl = cplhdr(m); + struct inpcb *inp; + + if (toep->tp_tp == NULL) + goto done; + + inp = toep->tp_tp->t_inpcb; + +/* + * Don't handle connection retry for now + */ +#ifdef notyet + struct inet_connection_sock *icsk = inet_csk(sk); + + if (rpl->status == CPL_ERR_CONN_EXIST && + icsk->icsk_retransmit_timer.function != act_open_retry_timer) { + icsk->icsk_retransmit_timer.function = act_open_retry_timer; + sk_reset_timer(so, &icsk->icsk_retransmit_timer, + jiffies + HZ / 2); + } else +#endif + { + inp_wlock(inp); + /* + * drops the inpcb lock + */ + fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status)); + } + + done: + m_free(m); +} + +/* + * Return whether a failed active open has allocated a TID + */ +static inline int +act_open_has_tid(int status) +{ + return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && + status != CPL_ERR_ARP_MISS; +} + +/* + * Process an ACT_OPEN_RPL CPL message. 
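+ * As handled below an ACT_OPEN_RPL is treated as a failed active open: on
+ * non-T3A parts any TID the failed open may still hold is queued for
+ * release, and the error is then propagated to the inpcb through
+ * active_open_failed().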
+ */
+static int
+do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+	struct toepcb *toep = (struct toepcb *)ctx;
+	struct cpl_act_open_rpl *rpl = cplhdr(m);
+
+	if (cdev->type != T3A && act_open_has_tid(rpl->status))
+		cxgb_queue_tid_release(cdev, GET_TID(rpl));
+
+	active_open_failed(toep, m);
+	return (0);
+}
+
+/*
+ * Handle an ARP failure for an active open.  XXX purge ofo queue
+ *
+ * XXX badly broken for crossed SYNs as the ATID is no longer valid.
+ * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
+ * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
+ * free the atid.  Hmm.
+ */
+#ifdef notyet
+static void
+act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
+{
+	struct toepcb *toep = m_get_toep(m);
+	struct tcpcb *tp = toep->tp_tp;
+	struct inpcb *inp = tp->t_inpcb;
+
+	inp_wlock(inp);
+	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
+		/*
+		 * drops the inpcb lock
+		 */
+		fail_act_open(toep, EHOSTUNREACH);
+		printf("freeing %p\n", m);
+
+		m_free(m);
+	} else
+		inp_wunlock(inp);
+}
+#endif
+/*
+ * Send an active open request.
+ */
+int
+t3_connect(struct toedev *tdev, struct socket *so,
+    struct rtentry *rt, struct sockaddr *nam)
+{
+	struct mbuf *m;
+	struct l2t_entry *e;
+	struct tom_data *d = TOM_DATA(tdev);
+	struct inpcb *inp = so_sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	struct toepcb *toep; /* allocated by toepcb_alloc() below */
+
+	int atid;
+
+	toep = toepcb_alloc();
+	if (toep == NULL)
+		goto out_err;
+
+	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
+		goto out_err;
+
+	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
+	if (!e)
+		goto free_tid;
+
+	inp_lock_assert(inp);
+	m = m_gethdr(M_WAITOK, MT_DATA);
+
+#if 0
+	m->m_toe.mt_toepcb = tp->t_toe;
+	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
+#endif
+	so_lock(so);
+
+	init_offload_socket(so, tdev, atid, e, rt, toep);
+
+	install_offload_ops(so);
+
+	mk_act_open_req(so, m, atid, e);
+	so_unlock(so);
+
+	soisconnecting(so);
+	toep = tp->t_toe;
+	m_set_toep(m, tp->t_toe);
+
+	toep->tp_state = TCPS_SYN_SENT;
+	l2t_send(d->cdev, (struct mbuf *)m, e);
+
+	if (toep->tp_ulp_mode)
+		t3_enable_ddp(toep, 0);
+	return (0);
+
+free_tid:
+	printf("failing connect - free atid\n");
+
+	free_atid(d->cdev, atid);
+out_err:
+	printf("return ENOMEM\n");
+	return (ENOMEM);
+}
+
+/*
+ * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
+ * not send multiple ABORT_REQs for the same connection and also that we do
+ * not try to send a message after the connection has closed.  It simply
+ * returns if an ABORT_REQ should not be generated after all.
+ */
+static void
+t3_send_reset(struct toepcb *toep)
+{
+
+	struct cpl_abort_req *req;
+	unsigned int tid = toep->tp_tid;
+	int mode = CPL_ABORT_SEND_RST;
+	struct tcpcb *tp = toep->tp_tp;
+	struct toedev *tdev = toep->tp_toedev;
+	struct socket *so = NULL;
+	struct mbuf *m;
+	struct sockbuf *snd;
+
+	if (tp) {
+		inp_lock_assert(tp->t_inpcb);
+		so = inp_inpcbtosocket(tp->t_inpcb);
+	}
+
+	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
+		tdev == NULL))
+		return;
+	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
+
+	snd = so_sockbuf_snd(so);
+	/* Purge the send queue so we don't send anything after an abort.
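+	 * Any bytes still queued in the socket's send buffer would otherwise
+	 * be pushed out behind the ABORT_REQ that is built below.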
*/ + if (so) + sbflush(snd); + if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) + mode |= CPL_ABORT_POST_CLOSE_REQ; + + m = m_gethdr_nofail(sizeof(*req)); + m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); + set_arp_failure_handler(m, abort_arp_failure); + + req = mtod(m, struct cpl_abort_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); + req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0; + req->rsvd1 = !(toep->tp_flags & TP_DATASENT); + req->cmd = mode; + if (tp && (tp->t_state == TCPS_SYN_SENT)) + mbufq_tail(&toep->out_of_order_queue, m); // defer + else + l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); +} + +static int +t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) +{ + struct inpcb *inp; + int error, optval; + + if (sopt->sopt_name == IP_OPTIONS) + return (ENOPROTOOPT); + + if (sopt->sopt_name != IP_TOS) + return (EOPNOTSUPP); + + error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + + if (error) + return (error); + + if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread)) + return (EPERM); + + inp = so_sotoinpcb(so); + inp_wlock(inp); + inp_ip_tos_set(inp, optval); +#if 0 + inp->inp_ip_tos = optval; +#endif + t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); + inp_wunlock(inp); + + return (0); +} + +static int +t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int err = 0; + size_t copied; + + if (sopt->sopt_name != TCP_CONGESTION && + sopt->sopt_name != TCP_NODELAY) + return (EOPNOTSUPP); + + if (sopt->sopt_name == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + int optlen = sopt->sopt_valsize; + struct tcpcb *tp; + + if (sopt->sopt_dir == SOPT_GET) { + KASSERT(0, ("unimplemented")); + return (EOPNOTSUPP); + } + + if (optlen < 1) + return (EINVAL); + + err = copyinstr(sopt->sopt_val, name, + min(TCP_CA_NAME_MAX - 1, optlen), &copied); + if (err) + return (err); + if (copied < 1) + return (EINVAL); + + tp = so_sototcpcb(so); + /* + * XXX I need to revisit this + */ + if ((err = t3_set_cong_control(so, name)) == 0) { +#ifdef CONGESTION_CONTROL_SUPPORTED + tp->t_cong_control = strdup(name, M_CXGB); +#endif + } else + return (err); + } else { + int optval, oldval; + struct inpcb *inp; + struct tcpcb *tp; + + if (sopt->sopt_dir == SOPT_GET) + return (EOPNOTSUPP); + + err = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + + if (err) + return (err); + + inp = so_sotoinpcb(so); + tp = inp_inpcbtotcpcb(inp); + + inp_wlock(inp); + + oldval = tp->t_flags; + if (optval) + tp->t_flags |= TF_NODELAY; + else + tp->t_flags &= ~TF_NODELAY; + inp_wunlock(inp); + + + if (oldval != tp->t_flags && (tp->t_toe != NULL)) + t3_set_nagle(tp->t_toe); + + } + + return (0); +} + +int +t3_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int err; + + if (sopt->sopt_level != IPPROTO_TCP) + err = t3_ip_ctloutput(so, sopt); + else + err = t3_tcp_ctloutput(so, sopt); + + if (err != EOPNOTSUPP) + return (err); + + return (tcp_ctloutput(so, sopt)); +} + +/* + * Returns true if we need to explicitly request RST when we receive new data + * on an RX-closed connection. + */ +static inline int +need_rst_on_excess_rx(const struct toepcb *toep) +{ + return (1); +} + +/* + * Handles Rx data that arrives in a state where the socket isn't accepting + * new data. 
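+ * The mbuf is dropped; if need_rst_on_excess_rx() says so and no abort is
+ * already in progress the connection is also reset.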
+ */ +static void +handle_excess_rx(struct toepcb *toep, struct mbuf *m) +{ + + if (need_rst_on_excess_rx(toep) && + !(toep->tp_flags & TP_ABORT_SHUTDOWN)) + t3_send_reset(toep); + m_freem(m); +} + +/* + * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) + * by getting the DDP offset from the TCB. + */ +static void +tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m) +{ + struct ddp_state *q = &toep->tp_ddp_state; + struct ddp_buf_state *bsp; + struct cpl_get_tcb_rpl *hdr; + unsigned int ddp_offset; + struct socket *so; + struct tcpcb *tp; + struct sockbuf *rcv; + int state; + + uint64_t t; + __be64 *tcb; + + tp = toep->tp_tp; + so = inp_inpcbtosocket(tp->t_inpcb); + + inp_lock_assert(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + /* Note that we only accout for CPL_GET_TCB issued by the DDP code. + * We really need a cookie in order to dispatch the RPLs. + */ + q->get_tcb_count--; + + /* It is a possible that a previous CPL already invalidated UBUF DDP + * and moved the cur_buf idx and hence no further processing of this + * skb is required. However, the app might be sleeping on + * !q->get_tcb_count and we need to wake it up. + */ + if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) { + int state = so_state_get(so); + + m_freem(m); + if (__predict_true((state & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); + + return; + } + + bsp = &q->buf_state[q->cur_buf]; + hdr = cplhdr(m); + tcb = (__be64 *)(hdr + 1); + if (q->cur_buf == 0) { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]); + ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET); + } else { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]); + ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET; + } + ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET; + m->m_cur_offset = bsp->cur_offset; + bsp->cur_offset = ddp_offset; + m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset; + + CTR5(KTR_TOM, + "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u", + q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset); + KASSERT(ddp_offset >= m->m_cur_offset, + ("ddp_offset=%u less than cur_offset=%u", + ddp_offset, m->m_cur_offset)); + +#if 0 +{ + unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx; + + t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]); + ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS; + + t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]); + rcv_nxt = t >> S_TCB_RCV_NXT; + rcv_nxt &= M_TCB_RCV_NXT; + + t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]); + rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET); + rx_hdr_offset &= M_TCB_RX_HDR_OFFSET; + + T3_TRACE2(TIDTB(sk), + "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x", + ddp_flags, rcv_nxt - rx_hdr_offset); + T3_TRACE4(TB(q), + "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u", + tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf); + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u", + rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset); + T3_TRACE2(TB(q), + "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x", + q->buf_state[0].flags, q->buf_state[1].flags); + +} +#endif + if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) { + handle_excess_rx(toep, m); + return; + } + +#ifdef T3_TRACE + if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len"); + } +#endif + if (bsp->flags & DDP_BF_NOCOPY) { +#ifdef T3_TRACE + T3_TRACE0(TB(q), + "tcb_rpl_as_ddp_complete: CANCEL 
UBUF"); + + if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + printk("!cancel_ubuf"); + t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf"); + } +#endif + m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1; + bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); + q->cur_buf ^= 1; + } else if (bsp->flags & DDP_BF_NOFLIP) { + + m->m_ddp_flags = 1; /* always a kernel buffer */ + + /* now HW buffer carries a user buffer */ + bsp->flags &= ~DDP_BF_NOFLIP; + bsp->flags |= DDP_BF_NOCOPY; + + /* It is possible that the CPL_GET_TCB_RPL doesn't indicate + * any new data in which case we're done. If in addition the + * offset is 0, then there wasn't a completion for the kbuf + * and we need to decrement the posted count. + */ + if (m->m_pkthdr.len == 0) { + if (ddp_offset == 0) { + q->kbuf_posted--; + bsp->flags |= DDP_BF_NODATA; + } + sockbuf_unlock(rcv); + m_free(m); + return; + } + } else { + sockbuf_unlock(rcv); + + /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, + * but it got here way late and nobody cares anymore. + */ + m_free(m); + return; + } + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; + CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u", + m->m_seq, q->cur_buf, m->m_pkthdr.len); + if (m->m_pkthdr.len == 0) { + q->user_ddp_pending = 0; + m_free(m); + } else + SBAPPEND(rcv, m); + + state = so_state_get(so); + if (__predict_true((state & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +/* + * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, + * in that case they are similar to DDP completions. + */ +static int +do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + /* OK if socket doesn't exist */ + if (toep == NULL) { + printf("null toep in do_get_tcb_rpl\n"); + return (CPL_RET_BUF_DONE); + } + + inp_wlock(toep->tp_tp->t_inpcb); + tcb_rpl_as_ddp_complete(toep, m); + inp_wunlock(toep->tp_tp->t_inpcb); + + return (0); +} + +static void +handle_ddp_data(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data *hdr = cplhdr(m); + unsigned int rcv_nxt = ntohl(hdr->seq); + struct sockbuf *rcv; + + if (tp->rcv_nxt == rcv_nxt) + return; + + inp_lock_assert(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + q = &toep->tp_ddp_state; + bsp = &q->buf_state[q->cur_buf]; + KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x", + rcv_nxt, tp->rcv_nxt)); + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d", + rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len); + +#ifdef T3_TRACE + if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "handle_ddp_data: neg len"); + } +#endif + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; + /* + * For now, don't re-enable DDP after a connection fell out of DDP + * mode. 
+ */ + q->ubuf_ddp_ready = 0; + sockbuf_unlock(rcv); +} + +/* + * Process new data received for a connection. + */ +static void +new_rx_data(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_rx_data *hdr = cplhdr(m); + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct sockbuf *rcv; + int state; + int len = be16toh(hdr->len); + + inp_wlock(tp->t_inpcb); + + so = inp_inpcbtosocket(tp->t_inpcb); + + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + inp_wunlock(tp->t_inpcb); + TRACE_EXIT; + return; + } + + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) + handle_ddp_data(toep, m); + + m->m_seq = ntohl(hdr->seq); + m->m_ulp_mode = 0; /* for iSCSI */ + +#if VALIDATE_SEQ + if (__predict_false(m->m_seq != tp->rcv_nxt)) { + log(LOG_ERR, + "%s: TID %u: Bad sequence number %u, expected %u\n", + toep->tp_toedev->name, toep->tp_tid, m->m_seq, + tp->rcv_nxt); + m_freem(m); + inp_wunlock(tp->t_inpcb); + return; + } +#endif + m_adj(m, sizeof(*hdr)); + +#ifdef URGENT_DATA_SUPPORTED + /* + * We don't handle urgent data yet + */ + if (__predict_false(hdr->urg)) + handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); + if (__predict_false(tp->urg_data == TCP_URG_NOTYET && + tp->urg_seq - tp->rcv_nxt < skb->len)) + tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - + tp->rcv_nxt]; +#endif + if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) { + toep->tp_delack_mode = hdr->dack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } + CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d", + m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes); + + if (len < m->m_pkthdr.len) + m->m_pkthdr.len = m->m_len = len; + + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; + toep->tp_enqueued_bytes += m->m_pkthdr.len; + CTR2(KTR_TOM, + "new_rx_data: seq 0x%x len %u", + m->m_seq, m->m_pkthdr.len); + inp_wunlock(tp->t_inpcb); + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); +#if 0 + if (sb_notify(rcv)) + DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len); +#endif + SBAPPEND(rcv, m); + +#ifdef notyet + /* + * We're giving too many credits to the card - but disable this check so we can keep on moving :-| + * + */ + KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1), + + ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", + so, rcv->sb_cc, rcv->sb_mbmax)); +#endif + + + CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d", + rcv->sb_cc, rcv->sb_mbcnt); + + state = so_state_get(so); + if (__predict_true((state & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +/* + * Handler for RX_DATA CPL messages. 
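+ * Thin dispatch wrapper; the actual payload handling is in new_rx_data()
+ * above.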
+ */ +static int +do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + DPRINTF("rx_data len=%d\n", m->m_pkthdr.len); + + new_rx_data(toep, m); + + return (0); +} + +static void +new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data_ddp *hdr; + struct socket *so; + unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; + int nomoredata = 0; + unsigned int delack_mode; + struct sockbuf *rcv; + + tp = toep->tp_tp; + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + if (__predict_false(so_no_receive(so))) { + + handle_excess_rx(toep, m); + inp_wunlock(tp->t_inpcb); + return; + } + + q = &toep->tp_ddp_state; + hdr = cplhdr(m); + ddp_report = ntohl(hdr->u.ddp_report); + buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; + bsp = &q->buf_state[buf_idx]; + + CTR4(KTR_TOM, + "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " + "hdr seq 0x%x len %u", + tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), + ntohs(hdr->len)); + CTR3(KTR_TOM, + "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d", + G_DDP_OFFSET(ddp_report), ddp_report, buf_idx); + + ddp_len = ntohs(hdr->len); + rcv_nxt = ntohl(hdr->seq) + ddp_len; + + delack_mode = G_DDP_DACK_MODE(ddp_report); + if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { + toep->tp_delack_mode = delack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } + + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + + tp->t_rcvtime = ticks; + /* + * Store the length in m->m_len. We are changing the meaning of + * m->m_len here, we need to be very careful that nothing from now on + * interprets ->len of this packet the usual way. + */ + m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq; + inp_wunlock(tp->t_inpcb); + CTR3(KTR_TOM, + "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ", + m->m_len, rcv_nxt, m->m_seq); + /* + * Figure out where the new data was placed in the buffer and store it + * in when. Assumes the buffer offset starts at 0, consumer needs to + * account for page pod's pg_offset. + */ + end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; + m->m_cur_offset = end_offset - m->m_pkthdr.len; + + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + bsp->cur_offset = end_offset; + toep->tp_enqueued_bytes += m->m_pkthdr.len; + + /* + * Length is only meaningful for kbuf + */ + if (!(bsp->flags & DDP_BF_NOCOPY)) + KASSERT(m->m_len <= bsp->gl->dgl_length, + ("length received exceeds ddp pages: len=%d dgl_length=%d", + m->m_len, bsp->gl->dgl_length)); + + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next)); + /* + * Bit 0 of flags stores whether the DDP buffer is completed. + * Note that other parts of the code depend on this being in bit 0. 
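+	 * That is why the assignment below stores
+	 * !!(ddp_report & F_DDP_BUF_COMPLETE) straight into m_ddp_flags
+	 * instead of using a named flag constant.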
+ */ + if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { + panic("spurious ddp completion"); + } else { + m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); + if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; /* flip buffers */ + } + + if (bsp->flags & DDP_BF_NOCOPY) { + m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY); + bsp->flags &= ~DDP_BF_NOCOPY; + } + + if (ddp_report & F_DDP_PSH) + m->m_ddp_flags |= DDP_BF_PSH; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; + +#ifdef notyet + skb_reset_transport_header(skb); + tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ +#endif + SBAPPEND(rcv, m); + + if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) || + (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1)) + || !(m->m_ddp_flags & DDP_BF_NOCOPY)))) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ + F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ + F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ + F_DDP_INVALID_PPOD) + +/* + * Handler for RX_DATA_DDP CPL messages. + */ +static int +do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = ctx; + const struct cpl_rx_data_ddp *hdr = cplhdr(m); + + VALIDATE_SOCK(so); + + if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { + log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", + GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); + return (CPL_RET_BUF_DONE); + } +#if 0 + skb->h.th = tcphdr_skb->h.th; +#endif + new_rx_data_ddp(toep, m); + return (0); +} + +static void +process_ddp_complete(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_ddp_complete *hdr; + unsigned int ddp_report, buf_idx, when, delack_mode; + int nomoredata = 0; + struct sockbuf *rcv; + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + if (__predict_false(so_no_receive(so))) { + struct inpcb *inp = so_sotoinpcb(so); + + handle_excess_rx(toep, m); + inp_wunlock(inp); + return; + } + q = &toep->tp_ddp_state; + hdr = cplhdr(m); + ddp_report = ntohl(hdr->ddp_report); + buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; + m->m_pkthdr.csum_data = tp->rcv_nxt; + + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + bsp = &q->buf_state[buf_idx]; + when = bsp->cur_offset; + m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when; + tp->rcv_nxt += m->m_len; + tp->t_rcvtime = ticks; + + delack_mode = G_DDP_DACK_MODE(ddp_report); + if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { + toep->tp_delack_mode = delack_mode; + toep->tp_delack_seq = tp->rcv_nxt; + } +#ifdef notyet + skb_reset_transport_header(skb); + tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ +#endif + inp_wunlock(tp->t_inpcb); + + KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + CTR5(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report 0x%x offset %u, len %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report), m->m_len); + + m->m_cur_offset = bsp->cur_offset; + bsp->cur_offset += m->m_len; + + if (!(bsp->flags & DDP_BF_NOFLIP)) { + q->cur_buf ^= 1; /* flip buffers */ + if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length) + nomoredata=1; + } + + CTR4(KTR_TOM, + "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " + "ddp_report %u 
offset %u", + tp->rcv_nxt, bsp->cur_offset, ddp_report, + G_DDP_OFFSET(ddp_report)); + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + if (nomoredata) + m->m_ddp_flags |= DDP_BF_NODATA; + + SBAPPEND(rcv, m); + if ((so_state_get(so) & SS_NOFDREF) == 0) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); +} + +/* + * Handler for RX_DDP_COMPLETE CPL messages. + */ +static int +do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = ctx; + + VALIDATE_SOCK(so); +#if 0 + skb->h.th = tcphdr_skb->h.th; +#endif + process_ddp_complete(toep, m); + return (0); +} + +/* + * Move a socket to TIME_WAIT state. We need to make some adjustments to the + * socket state before calling tcp_time_wait to comply with its expectations. + */ +static void +enter_timewait(struct tcpcb *tp) +{ + /* + * Bump rcv_nxt for the peer FIN. We don't do this at the time we + * process peer_close because we don't want to carry the peer FIN in + * the socket's receive queue and if we increment rcv_nxt without + * having the FIN in the receive queue we'll confuse facilities such + * as SIOCINQ. + */ + inp_wlock(tp->t_inpcb); + tp->rcv_nxt++; + + tp->ts_recent_age = 0; /* defeat recycling */ + tp->t_srtt = 0; /* defeat tcp_update_metrics */ + inp_wunlock(tp->t_inpcb); + tcp_offload_twstart(tp); +} + +/* + * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This + * function deals with the data that may be reported along with the FIN. + * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to + * perform normal FIN-related processing. In the latter case 1 indicates that + * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the + * skb can be freed. + */ +static int +handle_peer_close_data(struct socket *so, struct mbuf *m) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_peer_close *req = cplhdr(m); + unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ + struct sockbuf *rcv; + + if (tp->rcv_nxt == rcv_nxt) /* no data */ + return (0); + + CTR0(KTR_TOM, "handle_peer_close_data"); + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); + + /* + * Although we discard the data we want to process the FIN so + * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + + * PEER_CLOSE without data. In particular this PEER_CLOSE + * may be what will close the connection. We return 1 because + * handle_excess_rx() already freed the packet. 
+ */ + return (1); + } + + inp_lock_assert(tp->t_inpcb); + q = &toep->tp_ddp_state; + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + + bsp = &q->buf_state[q->cur_buf]; + m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_flags |= M_DDP; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = + DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; +#ifdef notyet + skb_reset_transport_header(skb); + tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ +#endif + tp->t_rcvtime = ticks; + SBAPPEND(rcv, m); + if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) + so_sorwakeup_locked(so); + else + sockbuf_unlock(rcv); + + return (1); +} + +/* + * Handle a peer FIN. + */ +static void +do_peer_fin(struct toepcb *toep, struct mbuf *m) +{ + struct socket *so; + struct tcpcb *tp = toep->tp_tp; + int keep, action; + + action = keep = 0; + CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state); + if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { + printf("abort_pending set\n"); + + goto out; + } + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { + keep = handle_peer_close_data(so, m); + if (keep < 0) { + inp_wunlock(tp->t_inpcb); + return; + } + } + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + CTR1(KTR_TOM, + "waking up waiters for cantrcvmore on %p ", so); + socantrcvmore(so); + + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tp->t_flags & TF_NEEDSYN) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + case TCPS_FIN_WAIT_2: + /* + * If we've sent an abort_req we must have sent it too late, + * HW will send us a reply telling us so, and this peer_close + * is really the last message for this connection and needs to + * be treated as an abort_rpl, i.e., transition the connection + * to TCP_CLOSE (note that the host stack does this at the + * time of generating the RST but we must wait for HW). + * Otherwise we enter TIME_WAIT. + */ + t3_release_offload_resources(toep); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + action = TCP_CLOSE; + } else { + action = TCP_TIMEWAIT; + } + break; + default: + log(LOG_ERR, + "%s: TID %u received PEER_CLOSE in bad state %d\n", + toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); + } + inp_wunlock(tp->t_inpcb); + + if (action == TCP_TIMEWAIT) { + enter_timewait(tp); + } else if (action == TCP_DROP) { + tcp_offload_drop(tp, 0); + } else if (action == TCP_CLOSE) { + tcp_offload_close(tp); + } + +#ifdef notyet + /* Do not send POLL_HUP for half duplex close. */ + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(so, 1, POLL_HUP); + else + sk_wake_async(so, 1, POLL_IN); +#endif + +out: + if (!keep) + m_free(m); +} + +/* + * Handler for PEER_CLOSE CPL messages. 
+ */ +static int +do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + VALIDATE_SOCK(so); + + do_peer_fin(toep, m); + return (0); +} + +static void +process_close_con_rpl(struct toepcb *toep, struct mbuf *m) +{ + struct cpl_close_con_rpl *rpl = cplhdr(m); + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + int action = 0; + struct sockbuf *rcv; + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ + + if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { + inp_wunlock(tp->t_inpcb); + goto out; + } + + CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, + tp->t_state, !!(so_state_get(so) & SS_NOFDREF)); + + switch (tp->t_state) { + case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ + t3_release_offload_resources(toep); + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + action = TCP_CLOSE; + + } else { + action = TCP_TIMEWAIT; + } + break; + case TCPS_LAST_ACK: + /* + * In this state we don't care about pending abort_rpl. + * If we've sent abort_req it was post-close and was sent too + * late, this close_con_rpl is the actual last message. + */ + t3_release_offload_resources(toep); + action = TCP_CLOSE; + break; + case TCPS_FIN_WAIT_1: + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: + * we should release the tp also, and use a + * compressed state. + */ + if (so) + rcv = so_sockbuf_rcv(so); + else + break; + + if (rcv->sb_state & SBS_CANTRCVMORE) { + int timeout; + + if (so) + soisdisconnected(so); + timeout = (tcp_fast_finwait2_recycle) ? + tcp_finwait2_timeout : tcp_maxidle; + tcp_timer_activate(tp, TT_2MSL, timeout); + } + tp->t_state = TCPS_FIN_WAIT_2; + if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && + (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { + action = TCP_DROP; + } + + break; + default: + log(LOG_ERR, + "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", + toep->tp_toedev->tod_name, toep->tp_tid, + tp->t_state); + } + inp_wunlock(tp->t_inpcb); + + + if (action == TCP_TIMEWAIT) { + enter_timewait(tp); + } else if (action == TCP_DROP) { + tcp_offload_drop(tp, 0); + } else if (action == TCP_CLOSE) { + tcp_offload_close(tp); + } +out: + m_freem(m); +} + +/* + * Handler for CLOSE_CON_RPL CPL messages. + */ +static int +do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, + void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + process_close_con_rpl(toep, m); + return (0); +} + +/* + * Process abort replies. We only process these messages if we anticipate + * them as the coordination between SW and HW in this area is somewhat lacking + * and sometimes we get ABORT_RPLs after we are done with the connection that + * originated the ABORT_REQ. 
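+ * Concretely, a reply that arrives while TP_ABORT_RPL_PENDING is not set on
+ * the toepcb is simply freed without touching the connection state.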
+ */ +static void +process_abort_rpl(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + int needclose = 0; + +#ifdef T3_TRACE + T3_TRACE1(TIDTB(sk), + "process_abort_rpl: GTS rpl pending %d", + sock_flag(sk, ABORT_RPL_PENDING)); +#endif + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + /* + * XXX panic on tcpdrop + */ + if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) + toep->tp_flags |= TP_ABORT_RPL_RCVD; + else { + toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); + if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || + !is_t3a(toep->tp_toedev)) { + if (toep->tp_flags & TP_ABORT_REQ_RCVD) + panic("TP_ABORT_REQ_RCVD set"); + t3_release_offload_resources(toep); + needclose = 1; + } + } + } + inp_wunlock(tp->t_inpcb); + + if (needclose) + tcp_offload_close(tp); + + m_free(m); +} + +/* + * Handle an ABORT_RPL_RSS CPL message. + */ +static int +do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_abort_rpl_rss *rpl = cplhdr(m); + struct toepcb *toep; + + /* + * Ignore replies to post-close aborts indicating that the abort was + * requested too late. These connections are terminated when we get + * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss + * arrives the TID is either no longer used or it has been recycled. + */ + if (rpl->status == CPL_ERR_ABORT_FAILED) { +discard: + m_free(m); + return (0); + } + + toep = (struct toepcb *)ctx; + + /* + * Sometimes we've already closed the socket, e.g., a post-close + * abort races with ABORT_REQ_RSS, the latter frees the socket + * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, + * but FW turns the ABORT_REQ into a regular one and so we get + * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. + */ + if (!toep) + goto discard; + + if (toep->tp_tp == NULL) { + log(LOG_NOTICE, "removing tid for abort\n"); + cxgb_remove_tid(cdev, toep, toep->tp_tid); + if (toep->tp_l2t) + l2t_release(L2DATA(cdev), toep->tp_l2t); + + toepcb_release(toep); + goto discard; + } + + log(LOG_NOTICE, "toep=%p\n", toep); + log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); + + toepcb_hold(toep); + process_abort_rpl(toep, m); + toepcb_release(toep); + return (0); +} + +/* + * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also + * indicate whether RST should be sent in response. + */ +static int +abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) +{ + struct tcpcb *tp = so_sototcpcb(so); + + switch (abort_reason) { + case CPL_ERR_BAD_SYN: +#if 0 + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through +#endif + case CPL_ERR_CONN_RESET: + // XXX need to handle SYN_RECV due to crossed SYNs + return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); + case CPL_ERR_XMIT_TIMEDOUT: + case CPL_ERR_PERSIST_TIMEDOUT: + case CPL_ERR_FINWAIT2_TIMEDOUT: + case CPL_ERR_KEEPALIVE_TIMEDOUT: +#if 0 + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); +#endif + return (ETIMEDOUT); + default: + return (EIO); + } +} + +static inline void +set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) +{ + struct cpl_abort_rpl *rpl = cplhdr(m); + + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(tid)); + m->m_len = m->m_pkthdr.len = sizeof(*rpl); + + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); + rpl->cmd = cmd; +} + +static void +send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) +{ + struct mbuf *reply_mbuf; + struct cpl_abort_req_rss *req = cplhdr(m); + + reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); + m_set_priority(m, CPL_PRIORITY_DATA); + m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); + set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); + m_free(m); +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static inline int +is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static void +send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) +{ + struct mbuf *reply_mbuf; + struct cpl_abort_req_rss *req = cplhdr(m); + + reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + + if (!reply_mbuf) { + /* Defer the reply. Stick rst_status into req->cmd. */ + req->status = rst_status; + t3_defer_reply(m, tdev, send_deferred_abort_rpl); + return; + } + + m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); + set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); + m_free(m); + + /* + * XXX need to sync with ARP as for SYN_RECV connections we can send + * these messages while ARP is pending. For other connection states + * it's not a problem. + */ + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); +} + +#ifdef notyet +static void +cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) +{ + CXGB_UNIMPLEMENTED(); +#ifdef notyet + struct request_sock *req = child->sk_user_data; + + inet_csk_reqsk_queue_removed(parent, req); + synq_remove(tcp_sk(child)); + __reqsk_free(req); + child->sk_user_data = NULL; +#endif +} + + +/* + * Performs the actual work to abort a SYN_RECV connection. + */ +static void +do_abort_syn_rcv(struct socket *child, struct socket *parent) +{ + struct tcpcb *parenttp = so_sototcpcb(parent); + struct tcpcb *childtp = so_sototcpcb(child); + + /* + * If the server is still open we clean up the child connection, + * otherwise the server already did the clean up as it was purging + * its SYN queue and the skb was just sitting in its backlog. + */ + if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { + cleanup_syn_rcv_conn(child, parent); + inp_wlock(childtp->t_inpcb); + t3_release_offload_resources(childtp->t_toe); + inp_wunlock(childtp->t_inpcb); + tcp_offload_close(childtp); + } +} +#endif + +/* + * Handle abort requests for a SYN_RECV connection. These need extra work + * because the socket is on its parent's SYN queue. 
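+ * In this port the SYN_RECV abort path is still a stub (see
+ * CXGB_UNIMPLEMENTED below); embryonic connections are tracked on the
+ * listen context's synq_head list instead.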
+ */ +static int +abort_syn_rcv(struct socket *so, struct mbuf *m) +{ + CXGB_UNIMPLEMENTED(); +#ifdef notyet + struct socket *parent; + struct toedev *tdev = toep->tp_toedev; + struct t3cdev *cdev = TOM_DATA(tdev)->cdev; + struct socket *oreq = so->so_incomp; + struct t3c_tid_entry *t3c_stid; + struct tid_info *t; + + if (!oreq) + return -1; /* somehow we are not on the SYN queue */ + + t = &(T3C_DATA(cdev))->tid_maps; + t3c_stid = lookup_stid(t, oreq->ts_recent); + parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; + + so_lock(parent); + do_abort_syn_rcv(so, parent); + send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); + so_unlock(parent); +#endif + return (0); +} + +/* + * Process abort requests. If we are waiting for an ABORT_RPL we ignore this + * request except that we need to reply to it. + */ +static void +process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) +{ + int rst_status = CPL_ABORT_NO_RST; + const struct cpl_abort_req_rss *req = cplhdr(m); + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + int needclose = 0; + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); + if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { + toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); + m_free(m); + goto skip; + } + + toep->tp_flags &= ~TP_ABORT_REQ_RCVD; + /* + * Three cases to consider: + * a) We haven't sent an abort_req; close the connection. + * b) We have sent a post-close abort_req that will get to TP too late + * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will + * be ignored and the connection should be closed now. + * c) We have sent a regular abort_req that will get to TP too late. + * That will generate an abort_rpl with status 0, wait for it. + */ + if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || + (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { + int error; + + error = abort_status_to_errno(so, req->status, + &rst_status); + so_error_set(so, error); + + if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) + so_sorwakeup(so); + /* + * SYN_RECV needs special processing. If abort_syn_rcv() + * returns 0 is has taken care of the abort. + */ + if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) + goto skip; + + t3_release_offload_resources(toep); + needclose = 1; + } + inp_wunlock(tp->t_inpcb); + + if (needclose) + tcp_offload_close(tp); + + send_abort_rpl(m, tdev, rst_status); + return; +skip: + inp_wunlock(tp->t_inpcb); +} + +/* + * Handle an ABORT_REQ_RSS CPL message. 
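+ * Negative-advice aborts are dropped outright; a connection still in
+ * SYN_RCVD is torn down directly here, and everything else goes through
+ * process_abort_req() above.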
+ */ +static int +do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + const struct cpl_abort_req_rss *req = cplhdr(m); + struct toepcb *toep = (struct toepcb *)ctx; + + if (is_neg_adv_abort(req->status)) { + m_free(m); + return (0); + } + + log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); + + if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { + cxgb_remove_tid(cdev, toep, toep->tp_tid); + toep->tp_flags |= TP_ABORT_REQ_RCVD; + + send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); + if (toep->tp_l2t) + l2t_release(L2DATA(cdev), toep->tp_l2t); + + /* + * Unhook + */ + toep->tp_tp->t_toe = NULL; + toep->tp_tp->t_flags &= ~TF_TOE; + toep->tp_tp = NULL; + /* + * XXX need to call syncache_chkrst - but we don't + * have a way of doing that yet + */ + toepcb_release(toep); + log(LOG_ERR, "abort for unestablished connection :-(\n"); + return (0); + } + if (toep->tp_tp == NULL) { + log(LOG_NOTICE, "disconnected toepcb\n"); + /* should be freed momentarily */ + return (0); + } + + + toepcb_hold(toep); + process_abort_req(toep, m, toep->tp_toedev); + toepcb_release(toep); + return (0); +} +#ifdef notyet +static void +pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) +{ + struct toedev *tdev = TOE_DEV(parent); + + do_abort_syn_rcv(child, parent); + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { + struct cpl_pass_accept_rpl *rpl = cplhdr(m); + + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + } else + m_free(m); +} +#endif +static void +handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) +{ + CXGB_UNIMPLEMENTED(); + +#ifdef notyet + struct t3cdev *cdev; + struct socket *parent; + struct socket *oreq; + struct t3c_tid_entry *t3c_stid; + struct tid_info *t; + struct tcpcb *otp, *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + /* + * If the connection is being aborted due to the parent listening + * socket going away there's nothing to do, the ABORT_REQ will close + * the connection. + */ + if (toep->tp_flags & TP_ABORT_RPL_PENDING) { + m_free(m); + return; + } + + oreq = so->so_incomp; + otp = so_sototcpcb(oreq); + + cdev = T3C_DEV(so); + t = &(T3C_DATA(cdev))->tid_maps; + t3c_stid = lookup_stid(t, otp->ts_recent); + parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; + + so_lock(parent); + pass_open_abort(so, parent, m); + so_unlock(parent); +#endif +} + +/* + * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly + * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV + * connection. + */ +static void +pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) +{ + +#ifdef notyet + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); +#endif + handle_pass_open_arp_failure(m_get_socket(m), m); +} + +/* + * Populate a reject CPL_PASS_ACCEPT_RPL WR. 
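+ * The reply reuses the peer IP from the request (not yet overwritten),
+ * sets F_TCAM_BYPASS with a CPL_PASS_OPEN_REJECT status, and mirrors opt2
+ * into the reserved word as a workaround for a HW bug.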
+ */ +static void +mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) +{ + struct cpl_pass_accept_req *req = cplhdr(req_mbuf); + struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); + unsigned int tid = GET_TID(req); + + m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); + rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ +} + +/* + * Send a deferred reject to an accept request. + */ +static void +reject_pass_request(struct toedev *tdev, struct mbuf *m) +{ + struct mbuf *reply_mbuf; + + reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); + mk_pass_accept_rpl(reply_mbuf, m); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); + m_free(m); +} + +static void +handle_syncache_event(int event, void *arg) +{ + struct toepcb *toep = arg; + + switch (event) { + case TOE_SC_ENTRY_PRESENT: + /* + * entry already exists - free toepcb + * and l2t + */ + printf("syncache entry present\n"); + toepcb_release(toep); + break; + case TOE_SC_DROP: + /* + * The syncache has given up on this entry + * either it timed out, or it was evicted + * we need to explicitly release the tid + */ + printf("syncache entry dropped\n"); + toepcb_release(toep); + break; + default: + log(LOG_ERR, "unknown syncache event %d\n", event); + break; + } +} + +static void +syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) +{ + struct in_conninfo inc; + struct tcpopt to; + struct tcphdr th; + struct inpcb *inp; + int mss, wsf, sack, ts; + uint32_t rcv_isn = ntohl(req->rcv_isn); + + bzero(&to, sizeof(struct tcpopt)); + inp = so_sotoinpcb(lso); + + /* + * Fill out information for entering us into the syncache + */ + inc.inc_fport = th.th_sport = req->peer_port; + inc.inc_lport = th.th_dport = req->local_port; + th.th_seq = req->rcv_isn; + th.th_flags = TH_SYN; + + toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; + + + inc.inc_isipv6 = 0; + inc.inc_len = 0; + inc.inc_faddr.s_addr = req->peer_ip; + inc.inc_laddr.s_addr = req->local_ip; + + DPRINTF("syncache add of %d:%d %d:%d\n", + ntohl(req->local_ip), ntohs(req->local_port), + ntohl(req->peer_ip), ntohs(req->peer_port)); + + mss = req->tcp_options.mss; + wsf = req->tcp_options.wsf; + ts = req->tcp_options.tstamp; + sack = req->tcp_options.sack; + to.to_mss = mss; + to.to_wscale = wsf; + to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); + tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); +} + + +/* + * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket + * lock held. Note that the sock here is a listening socket that is not owned + * by the TOE. 
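+ * Roughly: allocate a toepcb and an L2T entry for the embryonic connection,
+ * insert the TID, hand the request to the syncache, and answer with a
+ * CPL_PASS_ACCEPT_RPL; failures fall through to the reject path.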
+ */ +static void +process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, + struct listen_ctx *lctx) +{ + int rt_flags; + struct l2t_entry *e; + struct iff_mac tim; + struct mbuf *reply_mbuf, *ddp_mbuf = NULL; + struct cpl_pass_accept_rpl *rpl; + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int tid = GET_TID(req); + struct tom_data *d = TOM_DATA(tdev); + struct t3cdev *cdev = d->cdev; + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *newtoep; + struct rtentry *dst; + struct sockaddr_in nam; + struct t3c_data *td = T3C_DATA(cdev); + + reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + if (__predict_false(reply_mbuf == NULL)) { + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) + t3_defer_reply(m, tdev, reject_pass_request); + else { + cxgb_queue_tid_release(cdev, tid); + m_free(m); + } + DPRINTF("failed to get reply_mbuf\n"); + + goto out; + } + + if (tp->t_state != TCPS_LISTEN) { + DPRINTF("socket not in listen state\n"); + + goto reject; + } + + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); + goto reject; + } + +#ifdef notyet + /* + * XXX do route lookup to confirm that we're still listening on this + * address + */ + if (ip_route_input(skb, req->local_ip, req->peer_ip, + G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) + goto reject; + rt_flags = ((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); + dst_release(skb->dst); // done with the input route, release it + skb->dst = NULL; + + if ((rt_flags & RTF_LOCAL) == 0) + goto reject; +#endif + /* + * XXX + */ + rt_flags = RTF_LOCAL; + if ((rt_flags & RTF_LOCAL) == 0) + goto reject; + + /* + * Calculate values and add to syncache + */ + + newtoep = toepcb_alloc(); + if (newtoep == NULL) + goto reject; + + bzero(&nam, sizeof(struct sockaddr_in)); + + nam.sin_len = sizeof(struct sockaddr_in); + nam.sin_family = AF_INET; + nam.sin_addr.s_addr =req->peer_ip; + dst = rtalloc2((struct sockaddr *)&nam, 1, 0); + + if (dst == NULL) { + printf("failed to find route\n"); + goto reject; + } + e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, + (struct sockaddr *)&nam); + if (e == NULL) { + DPRINTF("failed to get l2t\n"); + } + /* + * Point to our listen socket until accept + */ + newtoep->tp_tp = tp; + newtoep->tp_flags = TP_SYN_RCVD; + newtoep->tp_tid = tid; + newtoep->tp_toedev = tdev; + tp->rcv_wnd = select_rcv_wnd(tdev, so); + + cxgb_insert_tid(cdev, d->client, newtoep, tid); + so_lock(so); + LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); + so_unlock(so); + + newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && + tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; + + if (newtoep->tp_ulp_mode) { + ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); + + if (ddp_mbuf == NULL) + newtoep->tp_ulp_mode = 0; + } + + CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", + TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); + set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); + /* + * XXX workaround for lack of syncache drop + */ + toepcb_hold(newtoep); + syncache_add_accept_req(req, so, newtoep); + + rpl = cplhdr(reply_mbuf); + reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + rpl->wr.wr_lo = 0; + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); + rpl->opt2 = htonl(calc_opt2(so, tdev)); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten + + rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | + V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); + rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | + CPL_PASS_OPEN_ACCEPT); + + DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); + + m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); + + l2t_send(cdev, reply_mbuf, e); + m_free(m); + if (newtoep->tp_ulp_mode) { + __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_MASK, + V_TF_DDP_OFF(1) | + TP_DDP_TIMER_WORKAROUND_VAL, 1); + } else + printf("not offloading\n"); + + + + return; +reject: + if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) + mk_pass_accept_rpl(reply_mbuf, m); + else + mk_tid_release(reply_mbuf, newtoep, tid); + cxgb_ofld_send(cdev, reply_mbuf); + m_free(m); +out: +#if 0 + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); +#else + return; +#endif +} + +/* + * Handle a CPL_PASS_ACCEPT_REQ message. + */ +static int +do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */ + struct tom_data *d = listen_ctx->tom_data; + +#if VALIDATE_TID + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int tid = GET_TID(req); + struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; + + if (unlikely(!lsk)) { + printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n", + cdev->name, + (unsigned long)((union listen_entry *)ctx - + t->stid_tab)); + return CPL_RET_BUF_DONE; + } + if (unlikely(tid >= t->ntids)) { + printk(KERN_ERR "%s: passive open TID %u too large\n", + cdev->name, tid); + return CPL_RET_BUF_DONE; + } + /* + * For T3A the current user of the TID may have closed but its last + * message(s) may have been backlogged so the TID appears to be still + * in use. Just take the TID away, the connection can close at its + * own leisure. For T3B this situation is a bug. + */ + if (!valid_new_tid(t, tid) && + cdev->type != T3A) { + printk(KERN_ERR "%s: passive open uses existing TID %u\n", + cdev->name, tid); + return CPL_RET_BUF_DONE; + } +#endif + + process_pass_accept_req(lso, m, &d->tdev, listen_ctx); + return (0); +} + +/* + * Called when a connection is established to translate the TCP options + * reported by HW to FreeBSD's native format. 
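+ * The options arrive packed in a 16-bit field decoded with the G_TCPOPT_*
+ * accessors; the MSS in particular is reported as an index into the
+ * adapter's MTU table, so the clamp below is td->mtus[G_TCPOPT_MSS(opt)]
+ * less 40 bytes for the fixed IP and TCP headers (an MTU entry of 1500,
+ * for example, yields a 1460-byte clamp).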
+ */ +static void +assign_rxopt(struct socket *so, unsigned int opt) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); + + inp_lock_assert(tp->t_inpcb); + + toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; + tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; + tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; + tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) + tp->rcv_scale = tp->request_r_scale; +} + +/* + * Completes some final bits of initialization for just established connections + * and changes their state to TCP_ESTABLISHED. + * + * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. + */ +static void +make_established(struct socket *so, u32 snd_isn, unsigned int opt) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; + assign_rxopt(so, opt); + + /* + *XXXXXXXXXXX + * + */ +#ifdef notyet + so->so_proto->pr_ctloutput = t3_ctloutput; +#endif + +#if 0 + inet_sk(sk)->id = tp->write_seq ^ jiffies; +#endif + /* + * XXX not clear what rcv_wup maps to + */ + /* + * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't + * pass through opt0. + */ + if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) + toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); + + dump_toepcb(toep); + +#ifdef notyet +/* + * no clean interface for marking ARP up to date + */ + dst_confirm(sk->sk_dst_cache); +#endif + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + soisconnected(so); +} + +static int +syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) +{ + + struct in_conninfo inc; + struct tcpopt to; + struct tcphdr th; + int mss, wsf, sack, ts; + struct mbuf *m = NULL; + const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); + unsigned int opt; + +#ifdef MAC +#error "no MAC support" +#endif + + opt = ntohs(req->tcp_opt); + + bzero(&to, sizeof(struct tcpopt)); + + /* + * Fill out information for entering us into the syncache + */ + inc.inc_fport = th.th_sport = req->peer_port; + inc.inc_lport = th.th_dport = req->local_port; + th.th_seq = req->rcv_isn; + th.th_flags = TH_ACK; + + inc.inc_isipv6 = 0; + inc.inc_len = 0; + inc.inc_faddr.s_addr = req->peer_ip; + inc.inc_laddr.s_addr = req->local_ip; + + mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; + wsf = G_TCPOPT_WSCALE_OK(opt); + ts = G_TCPOPT_TSTAMP(opt); + sack = G_TCPOPT_SACK(opt); + + to.to_mss = mss; + to.to_wscale = G_TCPOPT_SND_WSCALE(opt); + to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); + + DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", + ntohl(req->local_ip), ntohs(req->local_port), + ntohl(req->peer_ip), ntohs(req->peer_port), + mss, wsf, ts, sack); + return tcp_offload_syncache_expand(&inc, &to, &th, so, m); +} + + +/* + * Process a CPL_PASS_ESTABLISH message. 
XXX a lot of the locking doesn't work + * if we are in TCP_SYN_RECV due to crossed SYNs + */ +static int +do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_pass_establish *req = cplhdr(m); + struct toepcb *toep = (struct toepcb *)ctx; + struct tcpcb *tp = toep->tp_tp; + struct socket *so, *lso; + struct t3c_data *td = T3C_DATA(cdev); + struct sockbuf *snd, *rcv; + + // Complete socket initialization now that we have the SND_ISN + + struct toedev *tdev; + + + tdev = toep->tp_toedev; + + inp_wlock(tp->t_inpcb); + + /* + * + * XXX need to add reference while we're manipulating + */ + so = lso = inp_inpcbtosocket(tp->t_inpcb); + + inp_wunlock(tp->t_inpcb); + + so_lock(so); + LIST_REMOVE(toep, synq_entry); + so_unlock(so); + + if (!syncache_expand_establish_req(req, &so, toep)) { + /* + * No entry + */ + CXGB_UNIMPLEMENTED(); + } + if (so == NULL) { + /* + * Couldn't create the socket + */ + CXGB_UNIMPLEMENTED(); + } + + tp = so_sototcpcb(so); + inp_wlock(tp->t_inpcb); + + snd = so_sockbuf_snd(so); + rcv = so_sockbuf_rcv(so); + + snd->sb_flags |= SB_NOCOALESCE; + rcv->sb_flags |= SB_NOCOALESCE; + + toep->tp_tp = tp; + toep->tp_flags = 0; + tp->t_toe = toep; + reset_wr_list(toep); + tp->rcv_wnd = select_rcv_wnd(tdev, so); + tp->rcv_nxt = toep->tp_copied_seq; + install_offload_ops(so); + + toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs); + toep->tp_wr_unacked = 0; + toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); + toep->tp_qset_idx = 0; + toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu); + + /* + * XXX Cancel any keep alive timer + */ + + make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + /* + * XXX workaround for lack of syncache drop + */ + toepcb_release(toep); + inp_wunlock(tp->t_inpcb); + + CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); +#ifdef notyet + /* + * XXX not sure how these checks map to us + */ + if (unlikely(sk->sk_socket)) { // simultaneous opens only + sk->sk_state_change(sk); + sk_wake_async(so, 0, POLL_OUT); + } + /* + * The state for the new connection is now up to date. + * Next check if we should add the connection to the parent's + * accept queue. When the parent closes it resets connections + * on its SYN queue, so check if we are being reset. If so we + * don't need to do anything more, the coming ABORT_RPL will + * destroy this socket. Otherwise move the connection to the + * accept queue. + * + * Note that we reset the synq before closing the server so if + * we are not being reset the stid is still open. + */ + if (unlikely(!tp->forward_skb_hint)) { // removed from synq + __kfree_skb(skb); + goto unlock; + } +#endif + m_free(m); + + return (0); +} + +/* + * Fill in the right TID for CPL messages waiting in the out-of-order queue + * and send them to the TOE. + */ +static void +fixup_and_send_ofo(struct toepcb *toep) +{ + struct mbuf *m; + struct toedev *tdev = toep->tp_toedev; + struct tcpcb *tp = toep->tp_tp; + unsigned int tid = toep->tp_tid; + + log(LOG_NOTICE, "fixup_and_send_ofo\n"); + + inp_lock_assert(tp->t_inpcb); + while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) { + /* + * A variety of messages can be waiting but the fields we'll + * be touching are common to all so any message type will do. 
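+		 * Only wr_lo and the opcode/TID word are rewritten, and those
+		 * sit at the same offsets in every CPL, so cpl_close_con_req
+		 * below is used purely as a representative layout.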
+ */ + struct cpl_close_con_req *p = cplhdr(m); + + p->wr.wr_lo = htonl(V_WR_TID(tid)); + OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); + cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); + } +} + +/* + * Updates socket state from an active establish CPL message. Runs with the + * socket lock held. + */ +static void +socket_act_establish(struct socket *so, struct mbuf *m) +{ + struct cpl_act_establish *req = cplhdr(m); + u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + + if (__predict_false(tp->t_state != TCPS_SYN_SENT)) + log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n", + toep->tp_tid, tp->t_state); + + tp->ts_recent_age = ticks; + tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn; + toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs; + + make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + + /* + * Now that we finally have a TID send any CPL messages that we had to + * defer for lack of a TID. + */ + if (mbufq_len(&toep->out_of_order_queue)) + fixup_and_send_ofo(toep); + + if (__predict_false(so_state_get(so) & SS_NOFDREF)) { + /* + * XXX does this even make sense? + */ + so_sorwakeup(so); + } + m_free(m); +#ifdef notyet +/* + * XXX assume no write requests permitted while socket connection is + * incomplete + */ + /* + * Currently the send queue must be empty at this point because the + * socket layer does not send anything before a connection is + * established. To be future proof though we handle the possibility + * that there are pending buffers to send (either TX_DATA or + * CLOSE_CON_REQ). First we need to adjust the sequence number of the + * buffers according to the just learned write_seq, and then we send + * them on their way. + */ + fixup_pending_writeq_buffers(sk); + if (t3_push_frames(so, 1)) + sk->sk_write_space(sk); +#endif + + toep->tp_state = tp->t_state; + tcpstat.tcps_connects++; + +} + +/* + * Process a CPL_ACT_ESTABLISH message. + */ +static int +do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_act_establish *req = cplhdr(m); + unsigned int tid = GET_TID(req); + unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct toepcb *toep = (struct toepcb *)ctx; + struct tcpcb *tp = toep->tp_tp; + struct socket *so; + struct toedev *tdev; + struct tom_data *d; + + if (tp == NULL) { + free_atid(cdev, atid); + return (0); + } + inp_wlock(tp->t_inpcb); + + /* + * XXX + */ + so = inp_inpcbtosocket(tp->t_inpcb); + tdev = toep->tp_toedev; /* blow up here if link was down */ + d = TOM_DATA(tdev); + + /* + * It's OK if the TID is currently in use, the owning socket may have + * backlogged its last CPL message(s). Just take it away. + */ + toep->tp_tid = tid; + toep->tp_tp = tp; + so_insert_tid(d, toep, tid); + free_atid(cdev, atid); + toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data)); + + socket_act_establish(so, m); + inp_wunlock(tp->t_inpcb); + CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid); + cxgb_log_tcb(cdev->adapter, toep->tp_tid); + + return (0); +} + +/* + * Process an acknowledgment of WR completion. Advance snd_una and send the + * next batch of work requests from the write queue. 
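+ * Each queued WR mbuf stores its credit cost in m_pkthdr.csum_data and its
+ * payload length in m_pkthdr.len.  The loop below retires WRs while ACKed
+ * credits remain and, when an ACK only partially covers the WR at the head
+ * of the queue, just decrements its remaining cost: an ACK for 3 credits
+ * against pending costs of 2 and 2, say, frees the first WR and leaves the
+ * second owing 1 credit.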
+ */ +static void +wr_ack(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct cpl_wr_ack *hdr = cplhdr(m); + struct socket *so; + unsigned int credits = ntohs(hdr->credits); + u32 snd_una = ntohl(hdr->snd_una); + int bytes = 0; + struct sockbuf *snd; + + CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); + + inp_wlock(tp->t_inpcb); + so = inp_inpcbtosocket(tp->t_inpcb); + toep->tp_wr_avail += credits; + if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) + toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; + + while (credits) { + struct mbuf *p = peek_wr(toep); + + if (__predict_false(!p)) { + log(LOG_ERR, "%u WR_ACK credits for TID %u with " + "nothing pending, state %u wr_avail=%u\n", + credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); + break; + } + CTR2(KTR_TOM, + "wr_ack: p->credits=%d p->bytes=%d", + p->m_pkthdr.csum_data, p->m_pkthdr.len); + KASSERT(p->m_pkthdr.csum_data != 0, + ("empty request still on list")); + + if (__predict_false(credits < p->m_pkthdr.csum_data)) { + +#if DEBUG_WR > 1 + struct tx_data_wr *w = cplhdr(p); + log(LOG_ERR, + "TID %u got %u WR credits, need %u, len %u, " + "main body %u, frags %u, seq # %u, ACK una %u," + " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", + toep->tp_tid, credits, p->csum, p->len, + p->len - p->data_len, skb_shinfo(p)->nr_frags, + ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), + toep->tp_wr_avail, count_pending_wrs(tp) - credits); +#endif + p->m_pkthdr.csum_data -= credits; + break; + } else { + dequeue_wr(toep); + credits -= p->m_pkthdr.csum_data; + bytes += p->m_pkthdr.len; + CTR3(KTR_TOM, + "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", + p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); + + m_free(p); + } + } + +#if DEBUG_WR + check_wr_invariants(tp); +#endif + + if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { +#if VALIDATE_SEQ + struct tom_data *d = TOM_DATA(TOE_DEV(so)); + + log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " + "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, + toep->tp_tid, tp->snd_una); +#endif + goto out_free; + } + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + tp->ts_recent_age = ticks; +#ifdef notyet + /* + * Keep ARP entry "minty fresh" + */ + dst_confirm(sk->sk_dst_cache); +#endif + if (tp->snd_una == tp->snd_nxt) + toep->tp_flags &= ~TP_TX_WAIT_IDLE; + } + + snd = so_sockbuf_snd(so); + if (bytes) { + CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); + snd = so_sockbuf_snd(so); + sockbuf_lock(snd); + sbdrop_locked(snd, bytes); + so_sowwakeup_locked(so); + } + + if (snd->sb_sndptroff < snd->sb_cc) + t3_push_frames(so, 0); + +out_free: + inp_wunlock(tp->t_inpcb); + m_free(m); +} + +/* + * Handler for TX_DATA_ACK CPL messages. + */ +static int +do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + VALIDATE_SOCK(so); + + wr_ack(toep, m); + return 0; +} + +/* + * Handler for TRACE_PKT CPL messages. Just sink these packets. + */ +static int +do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) +{ + m_freem(m); + return 0; +} + +/* + * Reset a connection that is on a listener's SYN queue or accept queue, + * i.e., one that has not had a struct socket associated with it. + * Must be called from process context. + * + * Modeled after code in inet_csk_listen_stop(). 
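+ * Two flavors follow: t3_disconnect_acceptq() walks connections already on
+ * the listener's accept queue via so_listeners_apply_all(), while
+ * t3_reset_synq() drains the toepcbs still sitting on the listen context's
+ * SYN queue.  Both simply fire an ABORT_REQ and let the eventual ABORT_RPL
+ * complete the teardown.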
+ */ +static void +t3_reset_listen_child(struct socket *child) +{ + struct tcpcb *tp = so_sototcpcb(child); + + t3_send_reset(tp->t_toe); +} + + +static void +t3_child_disconnect(struct socket *so, void *arg) +{ + struct tcpcb *tp = so_sototcpcb(so); + + if (tp->t_flags & TF_TOE) { + inp_wlock(tp->t_inpcb); + t3_reset_listen_child(so); + inp_wunlock(tp->t_inpcb); + } +} + +/* + * Disconnect offloaded established but not yet accepted connections sitting + * on a server's accept_queue. We just send an ABORT_REQ at this point and + * finish off the disconnect later as we may need to wait for the ABORT_RPL. + */ +void +t3_disconnect_acceptq(struct socket *listen_so) +{ + + so_lock(listen_so); + so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); + so_unlock(listen_so); +} + +/* + * Reset offloaded connections sitting on a server's syn queue. As above + * we send ABORT_REQ and finish off when we get ABORT_RPL. + */ + +void +t3_reset_synq(struct listen_ctx *lctx) +{ + struct toepcb *toep; + + so_lock(lctx->lso); + while (!LIST_EMPTY(&lctx->synq_head)) { + toep = LIST_FIRST(&lctx->synq_head); + LIST_REMOVE(toep, synq_entry); + toep->tp_tp = NULL; + t3_send_reset(toep); + cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); + toepcb_release(toep); + } + so_unlock(lctx->lso); +} + + +int +t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color) +{ + unsigned int i, j, pidx; + struct pagepod *p; + struct mbuf *m; + struct ulp_mem_io *req; + unsigned int tid = toep->tp_tid; + const struct tom_data *td = TOM_DATA(toep->tp_toedev); + unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; + + CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", + gl, nppods, tag, maxoff, pg_off, color); + + for (i = 0; i < nppods; ++i) { + m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + req = mtod(m, struct ulp_mem_io *); + m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->wr.wr_lo = 0; + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | + V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); + + p = (struct pagepod *)(req + 1); + if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { + p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); + p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | + V_PPOD_COLOR(color)); + p->pp_max_offset = htonl(maxoff); + p->pp_page_offset = htonl(pg_off); + p->pp_rsvd = 0; + for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) + p->pp_addr[j] = pidx < gl->dgl_nelem ? + htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; + } else + p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ + send_or_defer(toep, m, 0); + ppod_addr += PPOD_SIZE; + } + return (0); +} + +/* + * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_cpl_barrier_ulp(struct cpl_barrier *b) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); + b->opcode = CPL_BARRIER; +} + +/* + * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 
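+ * As with the other mk_*_ulp() helpers, the CPL is fronted by a ulp_txpkt
+ * header whose length is expressed in 8-byte flits (hence sizeof(*req) / 8
+ * below), which is what allows several CPLs to be packed back to back
+ * inside a single work request.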
+ */ +static inline void +mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + txpkt = (struct ulp_txpkt *)req; + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); + req->cpuno = htons(cpuno); +} + +/* + * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. + */ +static inline void +mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, + unsigned int word, uint64_t mask, uint64_t val) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; + + CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", + tid, word, mask, val); + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = V_NO_REPLY(1); + req->cpu_idx = 0; + req->word = htons(word); + req->mask = htobe64(mask); + req->val = htobe64(val); +} + +/* + * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. + */ +static void +mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack, + unsigned int tid, unsigned int credits) +{ + struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; + + txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); + txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); + OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); + ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | + V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) | + V_RX_CREDITS(credits)); +} + +void +t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_barrier *lock; + struct cpl_set_tcb_field *req; + struct cpl_get_tcb *getreq; + struct ddp_state *p = &toep->tp_ddp_state; + +#if 0 + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); +#endif + wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + + sizeof(*getreq); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + lock = (struct cpl_barrier *)(wr + 1); + mk_cpl_barrier_ulp(lock); + + req = (struct cpl_set_tcb_field *)(lock + 1); + + CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx); + + /* Hmmm, not sure if this actually a good thing: reactivating + * the other buffer might be an issue if it has been completed + * already. However, that is unlikely, since the fact that the UBUF + * is not completed indicates that there is no oustanding data. 
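+	 * The compound WR assembled here is laid out as: work request header,
+	 * CPL_BARRIER, CPL_SET_TCB_FIELD (flip the DDP flags for the chosen
+	 * buffer), CPL_GET_TCB (read back how much data that buffer actually
+	 * received), CPL_BARRIER, matching the wrlen computed above.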
+	 */
+	if (bufidx == 0)
+		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+				     V_TF_DDP_ACTIVE_BUF(1) |
+				     V_TF_DDP_BUF0_VALID(1),
+				     V_TF_DDP_ACTIVE_BUF(1));
+	else
+		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
+				     V_TF_DDP_ACTIVE_BUF(1) |
+				     V_TF_DDP_BUF1_VALID(1), 0);
+
+	getreq = (struct cpl_get_tcb *)(req + 1);
+	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
+
+	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
+
+	/* Keep track of the number of outstanding CPL_GET_TCB requests
+	 */
+	p->get_tcb_count++;
+
+#ifdef T3_TRACE
+	T3_TRACE1(TIDTB(so),
+		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
+#endif
+	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
+}
+
+/**
+ * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
+ * @toep: the toepcb for the connection using the buffers
+ * @bufidx: index of HW DDP buffer (0 or 1)
+ * @tag0: new tag for HW buffer 0
+ * @tag1: new tag for HW buffer 1
+ * @len: new length for HW buf @bufidx
+ *
+ * Sends a compound WR to overlay a new DDP buffer on top of an existing
+ * buffer by changing the buffer tag and length and setting the valid and
+ * active flag accordingly.  The caller must ensure the new buffer is at
+ * least as big as the existing one.  Since we typically reprogram both HW
+ * buffers this function sets both tags for convenience.  Read the TCB to
+ * determine how much data was written into the buffer before the overlay
+ * took place.
+ */
+void
+t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
+		  unsigned int tag1, unsigned int len)
+{
+	unsigned int wrlen;
+	struct mbuf *m;
+	struct work_request_hdr *wr;
+	struct cpl_get_tcb *getreq;
+	struct cpl_set_tcb_field *req;
+	struct ddp_state *p = &toep->tp_ddp_state;
+
+	CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
+	    bufidx, tag0, tag1, len);
+#if 0
+	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
+#endif
+	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
+	m = m_gethdr_nofail(wrlen);
+	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
+	wr = mtod(m, struct work_request_hdr *);
+	m->m_pkthdr.len = m->m_len = wrlen;
+	bzero(wr, wrlen);
+
+
+	/* Set the ATOMIC flag to make sure that TP processes the following
+	 * CPLs in an atomic manner and no wire segments can be interleaved.
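+	 * The layout below is: work request header, three CPL_SET_TCB_FIELDs
+	 * (retag both buffers, then program the chosen buffer's length and
+	 * its DDP flags), followed by a CPL_GET_TCB that reads back how much
+	 * data landed in the old buffer before the overlay.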
+ */ + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); + req = (struct cpl_set_tcb_field *)(wr + 1); + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | + V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, + V_TCB_RX_DDP_BUF0_TAG(tag0) | + V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); + req++; + if (bufidx == 0) { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_0(1) | + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_0(0) | + V_TF_DDP_BUF0_VALID(1)); + } else { + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); + req++; + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_PUSH_DISABLE_1(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + V_TF_DDP_PUSH_DISABLE_1(0) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); + } + + getreq = (struct cpl_get_tcb *)(req + 1); + mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); + + /* Keep track of the number of oustanding CPL_GET_TCB requests + */ + p->get_tcb_count++; + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(sk), + "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " + "len %d", + bufidx, tag0, tag1, len); +#endif + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +/* + * Sends a compound WR containing all the CPL messages needed to program the + * two HW DDP buffers, namely optionally setting up the length and offset of + * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. + */ +void +t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, + unsigned int len1, unsigned int offset1, + uint64_t ddp_flags, uint64_t flag_mask, int modulate) +{ + unsigned int wrlen; + struct mbuf *m; + struct work_request_hdr *wr; + struct cpl_set_tcb_field *req; + + CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", + len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); + +#if 0 + SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); +#endif + wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + + (len1 ? sizeof(*req) : 0) + + (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); + m = m_gethdr_nofail(wrlen); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); + wr = mtod(m, struct work_request_hdr *); + bzero(wr, wrlen); + + wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + m->m_pkthdr.len = m->m_len = wrlen; + + req = (struct cpl_set_tcb_field *)(wr + 1); + if (len0) { /* program buffer 0 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | + V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); + req++; + } + if (len1) { /* program buffer 1 offset and length */ + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, + V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | + V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); + req++; + } + + mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, + ddp_flags); + + if (modulate) { + mk_rx_data_ack_ulp(toep, + (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, + toep->tp_copied_seq - toep->tp_rcv_wup); + toep->tp_rcv_wup = toep->tp_copied_seq; + } + +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " + "modulate %d", + len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, + modulate); +#endif + + cxgb_ofld_send(TOEP_T3C_DEV(toep), m); +} + +void +t3_init_wr_tab(unsigned int wr_len) +{ + int i; + + if (mbuf_wrs[1]) /* already initialized */ + return; + + for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { + int sgl_len = (3 * i) / 2 + (i & 1); + + sgl_len += 3; + mbuf_wrs[i] = sgl_len <= wr_len ? + 1 : 1 + (sgl_len - 2) / (wr_len - 1); + } + + wrlen = wr_len * 8; +} + +int +t3_init_cpl_io(void) +{ +#ifdef notyet + tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); + if (!tcphdr_skb) { + log(LOG_ERR, + "Chelsio TCP offload: can't allocate sk_buff\n"); + return -1; + } + skb_put(tcphdr_skb, sizeof(struct tcphdr)); + tcphdr_skb->h.raw = tcphdr_skb->data; + memset(tcphdr_skb->data, 0, tcphdr_skb->len); +#endif + + t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); + t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); + t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); + t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); + t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); + t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); + t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); + t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); + t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); + t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); + t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); + t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); + t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); + t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); + return (0); +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c new file mode 100644 index 0000000000000..77a3d760f54f7 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -0,0 +1,1030 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/smp.h> +#include <sys/socket.h> +#include <sys/syslog.h> +#include <sys/uio.h> +#include <sys/file.h> + +#include <machine/bus.h> +#include <machine/cpu.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + +#include <dev/cxgb/cxgb_config.h> +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + + +static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, + struct uio *uio, struct mbuf *top, struct mbuf *control, + int flags, struct thread *td); + +static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr, + struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, + int *flagsp); + +#define TMP_IOV_MAX 16 +#ifndef PG_FRAME +#define PG_FRAME ~PAGE_MASK +#endif +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 
M_NOWAIT : M_WAITOK) + +void +t3_init_socket_ops(void) +{ + struct protosw *prp; + + prp = pffindtype(AF_INET, SOCK_STREAM); + pru_sosend = prp->pr_usrreqs->pru_sosend; + pru_soreceive = prp->pr_usrreqs->pru_soreceive; +} + +struct cxgb_dma_info { + size_t cdi_mapped; + int cdi_nsegs; + bus_dma_segment_t *cdi_segs; + +}; + +static void +cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs, + bus_size_t mapsize, int error) +{ + struct cxgb_dma_info *cdi = arg; + + cdi->cdi_mapped = mapsize; + cdi->cdi_nsegs = nsegs; + cdi->cdi_segs = segs; +} + +static void +iov_adj(struct iovec **iov, int *iovcnt, size_t count) +{ + struct iovec *iovtmp; + int iovcnttmp; + caddr_t ptmp; + + if (count > 0) { + iovtmp = *iov; + iovcnttmp = *iovcnt; + while (count > 0) { + if (count < iovtmp->iov_len) { + ptmp = iovtmp->iov_base; + ptmp += count; + iovtmp->iov_base = ptmp; + iovtmp->iov_len -= count; + break; + } else + count -= iovtmp->iov_len; + iovtmp++; + iovcnttmp--; + } + *iov = iovtmp; + *iovcnt = iovcnttmp; + } else if (count < 0) { + iovtmp = &(*iov)[*iovcnt - 1]; + iovcnttmp = *iovcnt; + while (count < 0) { + if (-count < iovtmp->iov_len) { + iovtmp->iov_len += count; + break; + } else + count += iovtmp->iov_len; + iovtmp--; + iovcnttmp--; + } + *iovcnt = iovcnttmp; + } +} + +static void +cxgb_zero_copy_free(void *cl, void *arg) +{ + struct mbuf_vec *mv; + struct mbuf *m = (struct mbuf *)cl; + + mv = mtomv(m); + /* + * Physical addresses, don't try to free should be unheld separately from sbdrop + * + */ + mv->mv_count = 0; + m_free_iovec(m, m->m_type); +} + + +static int +cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) +{ + struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + int err, i, count, totcount, maxcount, totbytes, npages, curbytes; + uint64_t start, end; + vm_page_t *mp; + + totbytes = totcount = 0; + maxcount = *held; + + mp = m; + for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { + count = maxcount - totcount; + + start = (uintptr_t)iov->iov_base; + end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len); + start &= PG_FRAME; + end += PAGE_MASK; + end &= PG_FRAME; + npages = (end - start) >> PAGE_SHIFT; + + count = min(count, npages); + + err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); + if (err) { + vm_fault_unhold_pages(m, totcount); + return (err); + } + mp += count; + totcount += count; + curbytes = iov->iov_len; + if (count != npages) + curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK); + totbytes += curbytes; + } + uio->uio_resid -= totbytes; + + return (0); +} + +/* + * Returns whether a connection should enable DDP. 
This happens when all of + * the following conditions are met: + * - the connection's ULP mode is DDP + * - DDP is not already enabled + * - the last receive was above the DDP threshold + * - receive buffers are in user space + * - receive side isn't shutdown (handled by caller) + * - the connection's receive window is big enough so that sizable buffers + * can be posted without closing the window in the middle of DDP (checked + * when the connection is offloaded) + */ +static int +so_should_ddp(const struct toepcb *toep, int last_recv_len) +{ + + DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n", + toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres), + toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN)); + + return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) && + last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && + toep->tp_tp->rcv_wnd > + (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); +} + +static inline int +is_ddp(const struct mbuf *m) +{ + return ((m->m_flags & M_DDP) != 0); +} + +static inline int +is_ddp_psh(const struct mbuf *m) +{ + return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0); +} + +static int +m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + int curlen, startlen, resid_init, err = 0; + caddr_t buf; + + DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n", + m, offset, len); + + startlen = len; + resid_init = uio->uio_resid; + while (m && len) { + buf = mtod(m, caddr_t); + curlen = m->m_len; + if (offset && (offset < curlen)) { + curlen -= offset; + buf += offset; + offset = 0; + } else if (offset) { + offset -= curlen; + m = m->m_next; + continue; + } + err = uiomove(buf, min(len, curlen), uio); + if (err) { + printf("uiomove returned %d\n", err); + return (err); + } + + len -= min(len, curlen); + m = m->m_next; + } + DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n", + startlen - len, resid_init, uio->uio_resid); + return (err); +} + +/* + * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the + * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a + * DDP buffer. 
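+ * Three cases are handled below: plain RX_DATA is copied with m_uiomove();
+ * user-space DDP (DDP_BF_NOCOPY) needs no copy at all since the payload is
+ * already in the user buffer, so only the iovec and resid are advanced;
+ * kernel-buffer DDP goes through t3_ddp_copy().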
+ */ +static inline int +copy_data(const struct mbuf *m, int offset, int len, struct uio *uio) +{ + struct iovec *to = uio->uio_iov; + int err; + + if (__predict_true(!is_ddp(m))) /* RX_DATA */ + return m_uiomove(m, offset, len, uio); + if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */ + to->iov_len -= len; + to->iov_base = ((caddr_t)to->iov_base) + len; + uio->uio_iov = to; + uio->uio_resid -= len; + return (0); + } + err = t3_ddp_copy(m, offset, uio, len); /* kernel DDP */ + return (err); +} + +static void +cxgb_wait_dma_completion(struct toepcb *toep) +{ + struct rwlock *lock; + + lock = &toep->tp_tp->t_inpcb->inp_lock; + inp_wlock(toep->tp_tp->t_inpcb); + cv_wait_unlock(&toep->tp_cv, lock); +} + +static int +cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m) +{ + int i, seg_count, err, type; + struct mbuf *m0; + struct cxgb_dma_info cdi; + struct mbuf_vec *mv; + struct mbuf_iovec *mi; + bus_dma_segment_t *segs; + + err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio, + cxgb_dma_callback, &cdi, 0); + + if (err) + return (err); + seg_count = cdi.cdi_nsegs; + if ((m0 = mcl_alloc(seg_count, &type)) == NULL) { + bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap); + return (ENOMEM); + } + segs = cdi.cdi_segs; + m0->m_type = type; + m0->m_flags = (M_EXT|M_NOFREE); + m0->m_ext.ext_type = EXT_EXTREF; + m0->m_ext.ext_free = cxgb_zero_copy_free; + m0->m_ext.ext_arg1 = NULL; /* XXX: probably wrong /phk */ + m0->m_ext.ext_arg2 = NULL; + + mv = mtomv(m0); + mv->mv_count = seg_count; + mv->mv_first = 0; + for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++) + mi_collapse_sge(mi, segs); + + *m = m0; + + /* + * This appears to be a no-op at the moment + * as busdma is all or nothing need to make + * sure the tag values are large enough + * + */ + if (cdi.cdi_mapped < uio->uio_resid) { + uio->uio_resid -= cdi.cdi_mapped; + } else + uio->uio_resid = 0; + + return (0); +} + +static int +t3_sosend(struct socket *so, struct uio *uio) +{ + int rv, count, hold_resid, sent, iovcnt; + struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov; + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct mbuf *m; + struct uio uiotmp; + struct sockbuf *snd; + + /* + * Events requiring iteration: + * - number of pages exceeds max hold pages for process or system + * - number of pages exceeds maximum sg entries for a single WR + * + * We're limited to holding 128 pages at once - and we're limited to + * 34 SG entries per work request, but each SG entry can be any number + * of contiguous pages + * + */ + + uiotmp = *uio; + iovcnt = uio->uio_iovcnt; + iov = uio->uio_iov; + sent = 0; + snd = so_sockbuf_snd(so); +sendmore: + /* + * Make sure we don't exceed the socket buffer + */ + count = min(toep->tp_page_count, (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE); + rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0); + hold_resid = uiotmp.uio_resid; + if (rv) + return (rv); + + /* + * Bump past sent and shave off the unheld amount + */ + if (hold_resid > 0) { + iovtmpp = iovtmp; + memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); + if (sent) + iov_adj(&iovtmpp, &iovcnt, sent); + iov_adj(&iovtmpp, &iovcnt, -hold_resid); + uiotmp.uio_iov = iovtmpp; + uiotmp.uio_iovcnt = iovcnt; + + } + uiotmp.uio_resid = uio->uio_resid - hold_resid; + + /* + * Push off all held pages + * + */ + while (uiotmp.uio_resid > 0) { + rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m); + if (rv) { + vm_fault_unhold_pages(toep->tp_pages, count); + return (rv); + } + 
uio->uio_resid -= m->m_pkthdr.len; + sent += m->m_pkthdr.len; + sbappend(snd, m); + t3_push_frames(so, TRUE); + iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); + } + + /* + * Wait for pending I/O to be DMA'd to the card + * + */ + cxgb_wait_dma_completion(toep); + vm_fault_unhold_pages(toep->tp_pages, count); + /* + * If there is more data to send adjust local copy of iov + * to point to teh start + */ + if (hold_resid) { + iovtmpp = iovtmp; + memcpy(iovtmp, iov, iovcnt*sizeof(*iov)); + iov_adj(&iovtmpp, &iovcnt, sent); + uiotmp = *uio; + uiotmp.uio_iov = iovtmpp; + uiotmp.uio_iovcnt = iovcnt; + goto sendmore; + } + + return (0); +} + +static int +cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *control, int flags, struct thread *td) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toedev *tdev; + int zcopy_thres, zcopy_enabled, rv; + + /* + * In order to use DMA direct from userspace the following + * conditions must be met: + * - the connection is currently offloaded + * - ddp is enabled + * - the number of bytes to be transferred exceeds the threshold + * - the number of bytes currently in flight won't exceed the in-flight + * threshold XXX TODO + * - vm_fault_hold_user_pages succeeds + * - blocking socket XXX for now + * + */ + if (tp && tp->t_flags & TF_TOE) { + struct toepcb *toep = tp->t_toe; + + tdev = toep->tp_toedev; + zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres); + zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled); + + if (uio && (uio->uio_resid > zcopy_thres) && + (uio->uio_iovcnt < TMP_IOV_MAX) && ((so_state_get(so) & SS_NBIO) == 0) + && zcopy_enabled) { + rv = t3_sosend(so, uio); + if (rv != EAGAIN) + return (rv); + } + } + return pru_sosend(so, addr, uio, top, control, flags, td); +} + +/* + * Following replacement or removal of the first mbuf on the first mbuf chain + * of a socket buffer, push necessary state changes back into the socket + * buffer so that other consumers see the values consistently. 'nextrecord' + * is the callers locally stored value of the original value of + * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. + * NOTE: 'nextrecord' may be NULL. + */ +static __inline void +sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) +{ + sockbuf_lock_assert(sb); + /* + * First, update for the new value of nextrecord. If necessary, make + * it the first record. + */ + if (sb->sb_mb != NULL) + sb->sb_mb->m_nextpkt = nextrecord; + else + sb->sb_mb = nextrecord; + + /* + * Now update any dependent socket buffer fields to reflect the new + * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the + * addition of a second clause that takes care of the case where + * sb_mb has been updated, but remains the last record. + */ + if (sb->sb_mb == NULL) { + sb->sb_mbtail = NULL; + sb->sb_lastrecord = NULL; + } else if (sb->sb_mb->m_nextpkt == NULL) + sb->sb_lastrecord = sb->sb_mb; +} + +#define IS_NONBLOCKING(so) (so_state_get(so) & SS_NBIO) + +static int +t3_soreceive(struct socket *so, int *flagsp, struct uio *uio) +{ + struct tcpcb *tp = so_sototcpcb(so); + struct toepcb *toep = tp->t_toe; + struct mbuf *m; + uint32_t offset; + int err, flags, avail, len, copied, copied_unacked; + int target; /* Read at least this many bytes */ + int user_ddp_ok; + struct ddp_state *p; + struct inpcb *inp = so_sotoinpcb(so); + int socket_state, socket_error; + struct sockbuf *rcv; + + avail = offset = copied = copied_unacked = 0; + flags = flagsp ? 
(*flagsp &~ MSG_EOR) : 0; + rcv = so_sockbuf_rcv(so); + + err = sblock(rcv, SBLOCKWAIT(flags)); + p = &toep->tp_ddp_state; + + if (err) + return (err); + + rcv = so_sockbuf_rcv(so); + sockbuf_lock(rcv); + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + + p->user_ddp_pending = 0; +restart: + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + + len = uio->uio_resid; + m = rcv->sb_mb; + target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat; + user_ddp_ok = p->ubuf_ddp_ready; + p->cancel_ubuf = 0; + + if (len == 0) + goto done; + if (m) + goto got_mbuf; + + /* empty receive queue */ + if (copied >= target && (rcv->sb_mb == NULL) && + !p->user_ddp_pending) + goto done; + + socket_state = so_state_get(so); + socket_error = so_error_get(so); + rcv = so_sockbuf_rcv(so); + + if (copied) { + if (socket_error || tp->t_state == TCPS_CLOSED || + (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))) + goto done; + } else { + if (socket_state & SS_NOFDREF) + goto done; + if (socket_error) { + err = socket_error; + socket_error = 0; + goto done; + } + if (rcv->sb_state & SBS_CANTRCVMORE) + goto done; + if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) + goto done; + if (tp->t_state == TCPS_CLOSED) { + err = ENOTCONN; + goto done; + } + } + if (rcv->sb_mb && !p->user_ddp_pending) { + sockbuf_unlock(rcv); + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + sockbuf_lock(rcv); + copied_unacked = 0; + goto restart; + } + if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(toep, rcv, uio, + IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + } + if (p->kbuf[0] && (p->kbuf_posted == 0)) { + t3_post_kbuf(toep, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } + if (p->user_ddp_pending) { + /* One shot at DDP if we already have enough data */ + if (copied >= target) + user_ddp_ok = 0; + + if (rcv->sb_state & SBS_CANTRCVMORE) + goto done; + CTR0(KTR_TOM, "ddp pending -- waiting"); + if ((err = sbwait(rcv)) != 0) + goto done; +//for timers to work await_ddp_completion(sk, flags, &timeo); + } else if (copied >= target) + goto done; + else { + if (copied_unacked) { + int i = 0; + + sockbuf_unlock(rcv); + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + copied_unacked = 0; + if (mp_ncpus > 1) + while (i++ < 200 && rcv->sb_mb == NULL) + cpu_spinwait(); + sockbuf_lock(rcv); + } + if (rcv->sb_mb) + goto restart; + + if (rcv->sb_state & SBS_CANTRCVMORE) + goto done; + + CTR0(KTR_TOM, "no buffers -- waiting"); + + if ((err = sbwait(rcv)) != 0) + goto done; + } + goto restart; +got_mbuf: + /* + * Adjust the mbuf seqno if it has already been partially processed by + * soreceive_generic + */ + if (m->m_pkthdr.len != m->m_len) { + m->m_seq += m->m_pkthdr.len - m->m_len; + m->m_pkthdr.len = m->m_len; + } + + CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u " + "m_seq=0x%08x c_seq=0x%08x c_unack=%u", + (is_ddp(m) ? 
m->m_ddp_flags : 0), m->m_pkthdr.len, len, + m->m_seq, toep->tp_copied_seq, copied_unacked); + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), + ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT), + m->m_ext.ext_type, m->m_len, m->m_pkthdr.len)); + KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p" + " m_flags=0x%x m->m_len=%d", m->m_next, m->m_nextpkt, m->m_flags, m->m_len)); + if (m->m_pkthdr.len == 0) { + if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0) + panic("empty mbuf and NOCOPY not set\n"); + CTR0(KTR_TOM, "ddp done notification"); + p->user_ddp_pending = 0; + sbdroprecord_locked(rcv); + goto done; + } + + KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0, + ("offset will go negative: offset=%d copied_seq=0x%08x copied_unacked=%d m_seq=0x%08x", + offset, toep->tp_copied_seq, copied_unacked, m->m_seq)); + offset = toep->tp_copied_seq + copied_unacked - m->m_seq; + + if (offset >= m->m_pkthdr.len) + panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x " + "seq 0x%x pktlen %d ddp flags 0x%x", offset, + toep->tp_copied_seq + copied_unacked, m->m_seq, + m->m_pkthdr.len, m->m_ddp_flags); + + avail = m->m_pkthdr.len - offset; + if (len < avail) { + if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY)) + panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset); + avail = len; + rcv->sb_flags |= SB_IN_TOE; + } else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0) + rcv->sb_flags &= ~SB_IN_TOE; + +#ifdef URGENT_DATA_SUPPORTED + /* + * Check if the data we are preparing to copy contains urgent + * data. Either stop short of urgent data or skip it if it's + * first and we are not delivering urgent data inline. + */ + if (__predict_false(toep->tp_urg_data)) { + uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked; + + if (urg_offset < avail) { + if (urg_offset) { + /* stop short of the urgent data */ + avail = urg_offset; + } else if ((so_options_get(so) & SO_OOBINLINE) == 0) { + /* First byte is urgent, skip */ + toep->tp_copied_seq++; + offset++; + avail--; + if (!avail) + goto skip_copy; + } + } + } +#endif + if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) { + user_ddp_ok = 0; +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), "t3_sosend: PSH"); +#endif + } + + if (user_ddp_ok && !p->user_ddp_pending && + uio->uio_iov->iov_len > p->kbuf[0]->dgl_length && + p->ubuf_ddp_ready) { + p->user_ddp_pending = + !t3_overlay_ubuf(toep, rcv, uio, + IS_NONBLOCKING(so), flags, 1, 1); + if (p->user_ddp_pending) { + p->kbuf_posted++; + user_ddp_ok = 0; + } + DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending); + } else + DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n", + user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0, + p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted); + + /* + * If MSG_TRUNC is specified the data is discarded. 
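+	 * The copy below is skipped in that case, but the bytes still count
+	 * toward copied/copied_unacked and are still handed to
+	 * t3_cleanup_rbuf() later.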
+ * XXX need to check pr_atomic + */ + KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset)); + if (__predict_true(!(flags & MSG_TRUNC))) { + int resid = uio->uio_resid; + + sockbuf_unlock(rcv); + if ((err = copy_data(m, offset, avail, uio))) { + if (err) + err = EFAULT; + goto done_unlocked; + } + + sockbuf_lock(rcv); + if (avail != (resid - uio->uio_resid)) + printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n", + avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m)); + + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + } + + copied += avail; + copied_unacked += avail; + len -= avail; + +#ifdef URGENT_DATA_SUPPORTED +skip_copy: + if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq)) + tp->urg_data = 0; +#endif + /* + * If the buffer is fully consumed free it. If it's a DDP + * buffer also handle any events it indicates. + */ + if (avail + offset >= m->m_pkthdr.len) { + unsigned int fl = m->m_ddp_flags; + int exitnow, got_psh = 0, nomoredata = 0; + int count; + struct mbuf *nextrecord; + + if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) { + if (is_ddp_psh(m) && p->user_ddp_pending) + got_psh = 1; + + if (fl & DDP_BF_NOCOPY) + p->user_ddp_pending = 0; + else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) { + p->kbuf_posted--; + nomoredata = 1; + } else { + p->kbuf_posted--; + p->ubuf_ddp_ready = 1; + } + } + + nextrecord = m->m_nextpkt; + count = m->m_pkthdr.len; + while (count > 0) { + count -= m->m_len; + KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); + CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d", m->m_len, m->m_pkthdr.len); + sbfree(rcv, m); + rcv->sb_mb = m_free(m); + m = rcv->sb_mb; + } + sockbuf_pushsync(rcv, nextrecord); +#if 0 + sbdrop_locked(rcv, m->m_pkthdr.len); +#endif + exitnow = got_psh || nomoredata; + if (copied >= target && (rcv->sb_mb == NULL) && exitnow) + goto done; + if (copied_unacked > (rcv->sb_hiwat >> 2)) { + sockbuf_unlock(rcv); + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + copied_unacked = 0; + sockbuf_lock(rcv); + } + } + if (len > 0) + goto restart; + + done: + if ((tp->t_flags & TF_TOE) == 0) { + sockbuf_unlock(rcv); + err = EAGAIN; + goto done_unlocked; + } + /* + * If we can still receive decide what to do in preparation for the + * next receive. Note that RCV_SHUTDOWN is set if the connection + * transitioned to CLOSE but not if it was in that state to begin with. 
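+	 * Concretely, the block below cancels any pending user-buffer DDP
+	 * overlay (picking up data that arrived in the meantime), reposts the
+	 * kernel DDP buffer if none is outstanding, or enters DDP mode via
+	 * t3_enter_ddp() when so_should_ddp() decides this receive pattern
+	 * warrants it.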
+ */ + if (__predict_true((so_state_get(so) & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) { + if (p->user_ddp_pending) { + user_ddp_ok = 0; + t3_cancel_ubuf(toep, rcv); + if (rcv->sb_mb) { + if (copied < 0) + copied = 0; + if (len > 0) + goto restart; + } + p->user_ddp_pending = 0; + } + if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) { +#ifdef T3_TRACE + T3_TRACE0(TIDTB(so), + "chelsio_recvmsg: about to exit, repost kbuf"); +#endif + + t3_post_kbuf(toep, 1, IS_NONBLOCKING(so)); + p->kbuf_posted++; + } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) { + CTR1(KTR_TOM ,"entering ddp on tid=%u", toep->tp_tid); + if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev, + ddp_copy_limit), 0, IS_NONBLOCKING(so))) { + rcv->sb_flags |= SB_IN_TOE; + p->kbuf_posted = 1; + } + + } + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "chelsio_recvmsg <-: copied %d len %d buffers_freed %d " + "kbuf_posted %d user_ddp_pending %u", + copied, len, buffers_freed, p ? p->kbuf_posted : -1, + p->user_ddp_pending); +#endif + sockbuf_unlock(rcv); +done_unlocked: + if (copied_unacked && (tp->t_flags & TF_TOE)) { + inp_wlock(inp); + t3_cleanup_rbuf(tp, copied_unacked); + inp_wunlock(inp); + } + sbunlock(rcv); + + return (err); +} + +static int +cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp0, struct mbuf **controlp, int *flagsp) +{ + struct toedev *tdev; + int rv, zcopy_thres, zcopy_enabled, flags; + struct tcpcb *tp = so_sototcpcb(so); + struct sockbuf *rcv = so_sockbuf_rcv(so); + + flags = flagsp ? *flagsp &~ MSG_EOR : 0; + + /* + * In order to use DMA direct from userspace the following + * conditions must be met: + * - the connection is currently offloaded + * - ddp is enabled + * - the number of bytes to be transferred exceeds the threshold + * - the number of bytes currently in flight won't exceed the in-flight + * threshold XXX TODO + * - vm_fault_hold_user_pages succeeds + * - blocking socket XXX for now + * - iovcnt is 1 + * + */ + if (tp && (tp->t_flags & TF_TOE) && uio && ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) + && (uio->uio_iovcnt == 1) && (mp0 == NULL) && + ((rcv->sb_flags & SB_IN_TOE) || (uio->uio_iovcnt == 1))) { + struct toepcb *toep = tp->t_toe; + + tdev = toep->tp_toedev; + zcopy_thres = TOM_TUNABLE(tdev, ddp_thres); + zcopy_enabled = TOM_TUNABLE(tdev, ddp); + if ((rcv->sb_flags & SB_IN_TOE) ||((uio->uio_resid > zcopy_thres) && + (uio->uio_iovcnt == 1) && zcopy_enabled)) { + CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x t_flags=0x%x flags=0x%x uio_resid=%d", + rcv->sb_flags, tp->t_flags, flags, uio->uio_resid); + rv = t3_soreceive(so, flagsp, uio); + if (rv != EAGAIN) + return (rv); + else + printf("returned EAGAIN\n"); + } + } else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) { + struct sockbuf *rcv = so_sockbuf_rcv(so); + + log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n", + flags, uio->uio_iovcnt, rcv->sb_state); + } + + return pru_soreceive(so, psa, uio, mp0, controlp, flagsp); +} + +struct protosw cxgb_protosw; +struct pr_usrreqs cxgb_tcp_usrreqs; + + +void +t3_install_socket_ops(struct socket *so) +{ + static int copied = 0; + struct pr_usrreqs *pru; + struct protosw *psw; + + if (copied == 0) { + psw = so_protosw_get(so); + pru = psw->pr_usrreqs; + + bcopy(psw, &cxgb_protosw, sizeof(*psw)); + bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru)); + + cxgb_protosw.pr_ctloutput = t3_ctloutput; + cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs; + cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend; + 
cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive; + } + so_protosw_set(so, &cxgb_protosw); + +#if 0 + so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; + so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; +#endif +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_ddp.c b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c new file mode 100644 index 0000000000000..86e1e91b98271 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_ddp.c @@ -0,0 +1,738 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
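The t3_install_socket_ops() routine above interposes on the socket's protocol switch by copying the stock protosw and pr_usrreqs once, overriding pr_ctloutput, pru_sosend and pru_soreceive, and then pointing the socket at the patched copy. The stand-alone sketch below illustrates only that copy-then-override pattern; the ops_table and sock types are hypothetical stand-ins invented for illustration, not the kernel interface itself.

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for struct protosw / pr_usrreqs and a socket. */
struct ops_table {
	int (*receive)(void);
	int (*send)(void);
};

struct sock {
	const struct ops_table *ops;
};

static int stock_receive(void) { return 1; }
static int stock_send(void)    { return 2; }

/* Offloaded replacement, analogous to cxgb_soreceive(). */
static int offload_receive(void) { return 100; }

static struct ops_table offload_ops;	/* analogous to cxgb_protosw */

static void
install_offload_ops(struct sock *s)
{
	static int copied;

	if (!copied) {
		/* Copy the stock table once, then patch selected entries. */
		memcpy(&offload_ops, s->ops, sizeof(offload_ops));
		offload_ops.receive = offload_receive;
		copied = 1;
	}
	s->ops = &offload_ops;	/* analogous to so_protosw_set() */
}

int
main(void)
{
	struct ops_table stock = { stock_receive, stock_send };
	struct sock s = { &stock };

	install_offload_ops(&s);
	assert(s.ops->receive() == 100);	/* interposed entry */
	assert(s.ops->send() == 2);		/* untouched entry still works */
	printf("interposition ok\n");
	return (0);
}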
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/syslog.h> +#include <sys/uio.h> + +#include <machine/bus.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> + +#include <dev/cxgb/sys/mvec.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + + +#define MAX_SCHEDULE_TIMEOUT 300 + +/* + * Return the # of page pods needed to accommodate a # of pages. + */ +static inline unsigned int +pages2ppods(unsigned int pages) +{ + return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; +} + +/** + * t3_pin_pages - pin a user memory range and prepare it for DDP + * @addr - the starting address + * @len - the length of the range + * @newgl - contains the pages and physical addresses of the pinned range + * @gl - an existing gather list, may be %NULL + * + * Pins the pages in the user-space memory range [addr, addr + len) and + * maps them for DMA. Returns a gather list with the pinned pages and + * their physical addresses. If @gl is non NULL the pages it describes + * are compared against the pages for [addr, addr + len), and if the + * existing gather list already covers the range a new list is not + * allocated. Returns 0 on success, or a negative errno. On success if + * a new gather list was allocated it is returned in @newgl. 
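pages2ppods() above is a plain ceiling division: each page pod covers PPOD_PAGES pages, plus NUM_SENTINEL_PPODS trailing sentinel pods (zero in this commit, per cxgb_t3_ddp.h further down). A minimal sketch of the arithmetic, assuming the header's values of PPOD_PAGES = 4 and NUM_SENTINEL_PPODS = 0:

#include <assert.h>
#include <stdio.h>

#define PPOD_PAGES		4	/* pages per page pod (from cxgb_t3_ddp.h) */
#define NUM_SENTINEL_PPODS	0	/* trailing sentinel pods (from cxgb_t3_ddp.h) */

static unsigned int
pages2ppods(unsigned int pages)
{
	return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
}

int
main(void)
{
	assert(pages2ppods(1) == 1);	/* one page still needs a whole pod */
	assert(pages2ppods(4) == 1);	/* exactly one pod */
	assert(pages2ppods(9) == 3);	/* ceil(9 / 4) = 3 */
	printf("9 pages -> %u page pods\n", pages2ppods(9));
	return (0);
}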
+ */ +static int +t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, vm_offset_t addr, + size_t len, struct ddp_gather_list **newgl, + const struct ddp_gather_list *gl) +{ + int i = 0, err; + size_t pg_off; + unsigned int npages; + struct ddp_gather_list *p; + + /* + * XXX need x86 agnostic check + */ + if (addr + len > VM_MAXUSER_ADDRESS) + return (EFAULT); + + pg_off = addr & PAGE_MASK; + npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), + M_DEVBUF, M_NOWAIT|M_ZERO); + if (p == NULL) + return (ENOMEM); + + err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE); + if (err) + goto free_gl; + + if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && + gl->dgl_length >= len) { + for (i = 0; i < npages; i++) + if (p->dgl_pages[i] != gl->dgl_pages[i]) + goto different_gl; + err = 0; + goto unpin; + } + +different_gl: + p->dgl_length = len; + p->dgl_offset = pg_off; + p->dgl_nelem = npages; +#ifdef NEED_BUSDMA + p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, + PAGE_SIZE - pg_off, + PCI_DMA_FROMDEVICE) - pg_off; + for (i = 1; i < npages; ++i) + p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); +#endif + *newgl = p; + return (0); +unpin: + vm_fault_unhold_pages(p->dgl_pages, npages); + +free_gl: + + free(p, M_DEVBUF); + *newgl = NULL; + return (err); +} + +static void +unmap_ddp_gl(const struct ddp_gather_list *gl) +{ +#ifdef NEED_BUSDMA + int i; + + if (!gl->nelem) + return; + + pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset, + PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE); + for (i = 1; i < gl->nelem; ++i) + pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE, + PCI_DMA_FROMDEVICE); + +#endif +} + +static void +ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty) +{ + /* + * XXX mark pages as dirty before unholding + */ + vm_fault_unhold_pages(gl->dgl_pages, gl->dgl_nelem); +} + +void +t3_free_ddp_gl(struct ddp_gather_list *gl) +{ + unmap_ddp_gl(gl); + ddp_gl_free_pages(gl, 0); + free(gl, M_DEVBUF); +} + +/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ +#define MAX_PPODS 64U + +/* + * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in + * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we + * try to allocate enough page pods to accommodate the whole buffer, subject to + * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page + * pods before failing entirely. + */ +static int +alloc_buf1_ppods(struct toepcb *toep, struct ddp_state *p, + unsigned long addr, unsigned int len) +{ + int err, tag, npages, nppods; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + +#if 0 + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = min(pages2ppods(npages), MAX_PPODS); + nppods = roundup2(nppods, PPOD_CLUSTER_SIZE); + err = t3_alloc_ppods(d, nppods, &tag); + if (err && nppods > PPOD_CLUSTER_SIZE) { + nppods = PPOD_CLUSTER_SIZE; + err = t3_alloc_ppods(d, nppods, &tag); + } + if (err) + return (ENOMEM); + + p->ubuf_nppods = nppods; + p->ubuf_tag = tag; +#if NUM_DDP_KBUF == 1 + t3_set_ddp_tag(toep, 1, tag << 6); +#endif + return (0); +} + +/* + * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush + * won't block indefinitely if there's nothing to place (which should be rare). 
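alloc_buf1_ppods() sizes the user-buffer pod request from the page span of [addr, addr + len), caps it at MAX_PPODS (64, a 1MB buffer at 4KB pages) and rounds up to PPOD_CLUSTER_SIZE, falling back to a single cluster only when the larger allocation fails. The sketch below reproduces just that sizing arithmetic for a machine with assumed 4KB pages; the real allocation against the pod map is elided.

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE		4096UL		/* assumed 4KB pages */
#define PAGE_MASK		(PAGE_SIZE - 1)
#define PPOD_PAGES		4
#define NUM_SENTINEL_PPODS	0
#define MAX_PPODS		64U		/* 1MB buffer at 4KB pages */
#define PPOD_CLUSTER_SIZE	16

#define roundup2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))	/* y must be a power of 2 */
#define min(a, b)	((a) < (b) ? (a) : (b))

static unsigned int
pages2ppods(unsigned int pages)
{
	return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
}

static unsigned int
buf1_nppods(unsigned long addr, unsigned int len)
{
	unsigned int npages, nppods;

	npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) / PAGE_SIZE;
	nppods = min(pages2ppods(npages), MAX_PPODS);
	return roundup2(nppods, PPOD_CLUSTER_SIZE);
}

int
main(void)
{
	/* 256KB buffer starting 1KB into a page: 65 pages -> 17 pods -> 32. */
	assert(buf1_nppods(0x1400, 256 * 1024) == 32);
	/* A huge buffer is capped at MAX_PPODS. */
	assert(buf1_nppods(0, 8 * 1024 * 1024) == MAX_PPODS);
	printf("sizing ok\n");
	return (0);
}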
+ */ +#define UBUF_OFFSET 1 + +static __inline unsigned long +select_ddp_flags(const struct toepcb *toep, int buf_idx, + int nonblock, int rcv_flags) +{ + if (buf_idx == 1) { + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_1(1); + if (nonblock) + return V_TF_DDP_BUF1_FLUSH(1); + + return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(toep->tp_toedev, + ddp_push_wait)); + } + + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_0(1); + if (nonblock) + return V_TF_DDP_BUF0_FLUSH(1); + + return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(toep->tp_toedev, ddp_push_wait)); +} + +/* + * Reposts the kernel DDP buffer after it has been previously become full and + * invalidated. We just need to reset the offset and adjust the DDP flags. + * Conveniently, we can set the flags and the offset with a single message. + * Note that this function does not set the buffer length. Again conveniently + * our kernel buffer is of fixed size. If the length needs to be changed it + * needs to be done separately. + */ +static void +t3_repost_kbuf(struct toepcb *toep, unsigned int bufidx, int modulate, + int activate, int nonblock) +{ + struct ddp_state *p = &toep->tp_ddp_state; + unsigned long flags; + +#if 0 + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset; + p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0; + p->buf_state[bufidx].gl = p->kbuf[bufidx]; + p->cur_buf = bufidx; + p->kbuf_idx = bufidx; + + flags = select_ddp_flags(toep, bufidx, nonblock, 0); + if (!bufidx) + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF0_VALID(1), + V_TF_DDP_BUF0_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF0_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), modulate); + else + t3_setup_ddpbufs(toep, 0, 0, 0, 0, flags | + V_TF_DDP_PSH_NO_INVALIDATE0(p->kbuf_noinval) | + V_TF_DDP_PSH_NO_INVALIDATE1(p->kbuf_noinval) | + V_TF_DDP_BUF1_VALID(1) | + V_TF_DDP_ACTIVE_BUF(activate), + V_TF_DDP_BUF1_FLUSH(1) | + V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PSH_NO_INVALIDATE1(1) | V_TF_DDP_OFF(1) | + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), + modulate); + +} + +/** + * setup_uio_ppods - setup HW page pods for a user iovec + * @sk: the associated socket + * @uio: the uio + * @oft: additional bytes to map before the start of the buffer + * + * Pins a user iovec and sets up HW page pods for DDP into it. We allocate + * page pods for user buffers on the first call per socket. Afterwards we + * limit the buffer length to whatever the existing page pods can accommodate. + * Returns a negative error code or the length of the mapped buffer. + * + * The current implementation handles iovecs with only one entry. 
+ */ +static int +setup_uio_ppods(struct toepcb *toep, const struct uio *uio, int oft, int *length) +{ + int err; + unsigned int len; + struct ddp_gather_list *gl = NULL; + struct ddp_state *p = &toep->tp_ddp_state; + struct iovec *iov = uio->uio_iov; + vm_offset_t addr = (vm_offset_t)iov->iov_base - oft; + +#ifdef notyet + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + if (__predict_false(p->ubuf_nppods == 0)) { + err = alloc_buf1_ppods(toep, p, addr, iov->iov_len + oft); + if (err) + return (err); + } + + len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; + len -= addr & PAGE_MASK; + if (len > M_TCB_RX_DDP_BUF0_LEN) + len = M_TCB_RX_DDP_BUF0_LEN; + len = min(len, toep->tp_tp->rcv_wnd - 32768); + len = min(len, iov->iov_len + oft); + + if (len <= p->kbuf[0]->dgl_length) { + printf("length too short\n"); + return (EINVAL); + } + + err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); + if (err) + return (err); + if (gl) { + if (p->ubuf) + t3_free_ddp_gl(p->ubuf); + p->ubuf = gl; + t3_setup_ppods(toep, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, + gl->dgl_offset, 0); + } + *length = len; + return (0); +} + +/* + * + */ +void +t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int ubuf_pending = t3_ddp_ubuf_pending(toep); + int err = 0, count = 0; + + if (p->ubuf == NULL) + return; + + sockbuf_lock_assert(rcv); + + p->cancel_ubuf = 1; + while (ubuf_pending && !(rcv->sb_state & SBS_CANTRCVMORE)) { + CTR3(KTR_TOM, + "t3_cancel_ubuf: flags0 0x%x flags1 0x%x get_tcb_count %d", + p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY), + p->get_tcb_count); + if (p->get_tcb_count == 0) + t3_cancel_ddpbuf(toep, p->cur_buf); + else + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p SBS_CANTRCVMORE=%d", + err, p->get_tcb_count, rcv->sb_timeo, rcv, + !!(rcv->sb_state & SBS_CANTRCVMORE)); + + while (p->get_tcb_count && !(rcv->sb_state & SBS_CANTRCVMORE)) { + if (count & 0xfffffff) + CTR5(KTR_TOM, "waiting err=%d get_tcb_count=%d timeo=%d rcv=%p count=%d", + err, p->get_tcb_count, rcv->sb_timeo, rcv, count); + count++; + err = sbwait(rcv); + } + ubuf_pending = t3_ddp_ubuf_pending(toep); + } + p->cancel_ubuf = 0; + p->user_ddp_pending = 0; + +} + +#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE0(1) | \ + V_TF_DDP_PSH_NO_INVALIDATE1(1) | \ + V_TF_DDP_BUF1_FLUSH(1) | \ + V_TF_DDP_BUF0_FLUSH(1) | \ + V_TF_DDP_PUSH_DISABLE_1(1) | \ + V_TF_DDP_PUSH_DISABLE_0(1) | \ + V_TF_DDP_INDICATE_OUT(1)) + +/* + * Post a user buffer as an overlay on top of the current kernel buffer. + */ +int +t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv, + const struct uio *uio, int nonblock, int rcv_flags, + int modulate, int post_kbuf) +{ + int err, len, ubuf_idx; + unsigned long flags; + struct ddp_state *p = &toep->tp_ddp_state; + + if (p->kbuf[0] == NULL) { + return (EINVAL); + } + sockbuf_unlock(rcv); + err = setup_uio_ppods(toep, uio, 0, &len); + sockbuf_lock(rcv); + if (err) + return (err); + + if ((rcv->sb_state & SBS_CANTRCVMORE) || + (toep->tp_tp->t_flags & TF_TOE) == 0) + return (EINVAL); + + ubuf_idx = p->kbuf_idx; + p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; + /* Use existing offset */ + /* Don't need to update .gl, user buffer isn't copied. 
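setup_uio_ppods() above derives the DDP-able length from the pods already allocated, then clamps it in sequence: subtract the start address's page offset, cap at the hardware field limit M_TCB_RX_DDP_BUF0_LEN, cap at the receive window minus 32KB, and finally cap at the iovec length; a result no longer than the kernel buffer is rejected. A small sketch of that clamping chain with illustrative values; the hardware limit and window sizes here are assumptions, not the real register contents.

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096u		/* assumed */
#define PAGE_MASK	(PAGE_SIZE - 1)
#define PPOD_PAGES	4u
#define NUM_SENTINEL_PPODS 0u
#define HW_BUF0_LEN_MAX	(256u * 1024)	/* stand-in for M_TCB_RX_DDP_BUF0_LEN */

static unsigned int umin(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* Returns the length that would be posted for DDP, or 0 if it would not
 * beat the kernel-buffer size (the "length too short" rejection case). */
static unsigned int
clamp_ubuf_len(unsigned int nppods, unsigned long addr, unsigned int iov_len,
    unsigned int rcv_wnd, unsigned int kbuf_len)
{
	unsigned int len;

	len = (nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE;
	len -= addr & PAGE_MASK;
	len = umin(len, HW_BUF0_LEN_MAX);
	len = umin(len, rcv_wnd - 32768);
	len = umin(len, iov_len);
	return (len <= kbuf_len) ? 0 : len;
}

int
main(void)
{
	/* 32 pods cover 512KB; a 200KB iovec in a 256KB window wins. */
	unsigned int len = clamp_ubuf_len(32, 0x2000, 200 * 1024,
	    256 * 1024, 64 * 1024);
	assert(len == 200 * 1024);
	/* A 16KB iovec does not beat a 64KB kernel buffer, so it is rejected. */
	assert(clamp_ubuf_len(32, 0, 16 * 1024, 256 * 1024, 64 * 1024) == 0);
	printf("clamped len = %u\n", len);
	return (0);
}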
*/ + p->cur_buf = ubuf_idx; + + flags = select_ddp_flags(toep, ubuf_idx, nonblock, rcv_flags); + + if (post_kbuf) { + struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; + + dbs->cur_offset = 0; + dbs->flags = 0; + dbs->gl = p->kbuf[ubuf_idx ^ 1]; + p->kbuf_idx ^= 1; + flags |= p->kbuf_idx ? + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); + } + + if (ubuf_idx == 0) { + t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, + len); + t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0, + flags, + OVERLAY_MASK | flags, 1); + } else { + t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, + len); + t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0, + flags, + OVERLAY_MASK | flags, 1); + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(so), + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " + " kbuf_idx %d", + p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); +#endif + CTR3(KTR_TOM, + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x", + p->ubuf_tag, flags, OVERLAY_MASK); + CTR3(KTR_TOM, + "t3_overlay_ubuf: ubuf_idx %d kbuf_idx %d post_kbuf %d", + ubuf_idx, p->kbuf_idx, post_kbuf); + + return (0); +} + +/* + * Clean up DDP state that needs to survive until socket close time, such as the + * DDP buffers. The buffers are already unmapped at this point as unmapping + * needs the PCI device and a socket may close long after the device is removed. + */ +void +t3_cleanup_ddp(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) + if (p->kbuf[idx]) { + ddp_gl_free_pages(p->kbuf[idx], 0); + free(p->kbuf[idx], M_DEVBUF); + } + if (p->ubuf) { + ddp_gl_free_pages(p->ubuf, 0); + free(p->ubuf, M_DEVBUF); + p->ubuf = NULL; + } + toep->tp_ulp_mode = 0; +} + +/* + * This is a companion to t3_cleanup_ddp() and releases the HW resources + * associated with a connection's DDP state, such as the page pods. + * It's called when HW is done with a connection. The rest of the state + * remains available until both HW and the app are done with the connection. + */ +void +t3_release_ddp_resources(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + int idx; + + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + t3_free_ppods(d, p->kbuf_tag[idx], + p->kbuf_nppods[idx]); + unmap_ddp_gl(p->kbuf[idx]); + } + + if (p->ubuf_nppods) { + t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods); + p->ubuf_nppods = 0; + } + if (p->ubuf) + unmap_ddp_gl(p->ubuf); + +} + +void +t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock) +{ + struct ddp_state *p = &toep->tp_ddp_state; + + t3_set_ddp_tag(toep, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6); + t3_set_ddp_buf(toep, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length); + t3_repost_kbuf(toep, p->cur_buf, modulate, 1, nonblock); +#ifdef T3_TRACE + T3_TRACE1(TIDTB(so), + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +#endif + CTR1(KTR_TOM, + "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); +} + +/* + * Prepare a socket for DDP. Must be called when the socket is known to be + * open. 
+ */ +int +t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock) +{ + int i, err = ENOMEM; + static vm_pindex_t color; + unsigned int nppods, kbuf_pages, idx = 0; + struct ddp_state *p = &toep->tp_ddp_state; + struct tom_data *d = TOM_DATA(toep->tp_toedev); + + + if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN) + return (EINVAL); + +#ifdef notyet + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +#endif + kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = pages2ppods(kbuf_pages); + + p->kbuf_noinval = !!waitall; + p->kbuf_tag[NUM_DDP_KBUF - 1] = -1; + for (idx = 0; idx < NUM_DDP_KBUF; idx++) { + p->kbuf[idx] = + malloc(sizeof (struct ddp_gather_list) + kbuf_pages * + sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO); + if (p->kbuf[idx] == NULL) + goto err; + err = t3_alloc_ppods(d, nppods, &p->kbuf_tag[idx]); + if (err) { + printf("t3_alloc_ppods failed err=%d\n", err); + goto err; + } + + p->kbuf_nppods[idx] = nppods; + p->kbuf[idx]->dgl_length = kbuf_size; + p->kbuf[idx]->dgl_offset = 0; + p->kbuf[idx]->dgl_nelem = kbuf_pages; + + for (i = 0; i < kbuf_pages; ++i) { + p->kbuf[idx]->dgl_pages[i] = vm_page_alloc(NULL, color, + VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | + VM_ALLOC_ZERO); + if (p->kbuf[idx]->dgl_pages[i] == NULL) { + p->kbuf[idx]->dgl_nelem = i; + printf("failed to allocate kbuf pages\n"); + goto err; + } + } +#ifdef NEED_BUSDMA + /* + * XXX we'll need this for VT-d or any platform with an iommu :-/ + * + */ + for (i = 0; i < kbuf_pages; ++i) + p->kbuf[idx]->phys_addr[i] = + pci_map_page(p->pdev, p->kbuf[idx]->pages[i], + 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); +#endif + t3_setup_ppods(toep, p->kbuf[idx], nppods, p->kbuf_tag[idx], + p->kbuf[idx]->dgl_length, 0, 0); + } + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + + t3_set_ddp_tag(toep, 0, p->kbuf_tag[0] << 6); + t3_set_ddp_buf(toep, 0, 0, p->kbuf[0]->dgl_length); + t3_repost_kbuf(toep, 0, 0, 1, nonblock); + + t3_set_rcv_coalesce_enable(toep, + TOM_TUNABLE(toep->tp_toedev, ddp_rcvcoalesce)); + t3_set_dack_mss(toep, TOM_TUNABLE(toep->tp_toedev, delack)>>1); + +#ifdef T3_TRACE + T3_TRACE4(TIDTB(so), + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); +#endif + CTR4(KTR_TOM, + "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", + kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); + cxgb_log_tcb(TOEP_T3C_DEV(toep)->adapter, toep->tp_tid); + return (0); + +err: + t3_release_ddp_resources(toep); + t3_cleanup_ddp(toep); + return (err); +} + +int +t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len) +{ + int resid_init, err; + struct ddp_gather_list *gl = (struct ddp_gather_list *)m->m_ddp_gl; + + resid_init = uio->uio_resid; + + if (!gl->dgl_pages) + panic("pages not set\n"); + + CTR4(KTR_TOM, "t3_ddp_copy: offset=%d dgl_offset=%d cur_offset=%d len=%d", + offset, gl->dgl_offset, m->m_cur_offset, len); + offset += gl->dgl_offset + m->m_cur_offset; + KASSERT(len <= gl->dgl_length, + ("len=%d > dgl_length=%d in ddp_copy\n", len, gl->dgl_length)); + + + err = uiomove_fromphys(gl->dgl_pages, offset, len, uio); + return (err); +} + + +/* + * Allocate n page pods. Returns -1 on failure or the page pod tag. 
+ */ +int +t3_alloc_ppods(struct tom_data *td, unsigned int n, int *ptag) +{ + unsigned int i, j; + + if (__predict_false(!td->ppod_map)) { + printf("ppod_map not set\n"); + return (EINVAL); + } + + mtx_lock(&td->ppod_map_lock); + for (i = 0; i < td->nppods; ) { + + for (j = 0; j < n; ++j) /* scan ppod_map[i..i+n-1] */ + if (td->ppod_map[i + j]) { + i = i + j + 1; + goto next; + } + memset(&td->ppod_map[i], 1, n); /* allocate range */ + mtx_unlock(&td->ppod_map_lock); + CTR2(KTR_TOM, + "t3_alloc_ppods: n=%u tag=%u", n, i); + *ptag = i; + return (0); + next: ; + } + mtx_unlock(&td->ppod_map_lock); + return (0); +} + +void +t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n) +{ + /* No need to take ppod_lock here */ + memset(&td->ppod_map[tag], 0, n); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_defs.h b/sys/dev/cxgb/ulp/tom/cxgb_defs.h new file mode 100644 index 0000000000000..8c14f5ae89c87 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_defs.h @@ -0,0 +1,90 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
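t3_alloc_ppods() above scans the byte-per-pod ppod_map for a run of n free slots and marks the run used with memset(); t3_free_ppods() simply clears it again. Below is a lock-free userspace sketch of the same first-fit scan so the indexing is easy to follow; unlike the driver it reports failure with -1 when no run exists, purely for illustration.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define NPPODS	32			/* tiny map for the example */
static unsigned char ppod_map[NPPODS];	/* 0 = free, 1 = allocated */

/* First-fit scan for n consecutive free pods; returns the tag (start index)
 * or -1 when no such run exists. */
static int
alloc_ppods(unsigned int n)
{
	unsigned int i, j;

	for (i = 0; i + n <= NPPODS; ) {
		for (j = 0; j < n; ++j)
			if (ppod_map[i + j]) {
				i = i + j + 1;	/* skip past the busy slot */
				goto next;
			}
		memset(&ppod_map[i], 1, n);	/* claim the run */
		return (int)i;
next:		;
	}
	return -1;
}

static void
free_ppods(unsigned int tag, unsigned int n)
{
	memset(&ppod_map[tag], 0, n);
}

int
main(void)
{
	int a = alloc_ppods(16), b = alloc_ppods(16);

	assert(a == 0 && b == 16);
	assert(alloc_ppods(1) == -1);	/* map exhausted */
	free_ppods(a, 16);
	assert(alloc_ppods(8) == 0);	/* reuses the freed run */
	printf("pod map ok\n");
	return (0);
}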
+ + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_DEFS_H_ +#define CXGB_DEFS_H_ + +#define VALIDATE_TID 0 + +#define TOEPCB(so) ((struct toepcb *)(sototcpcb((so))->t_toe)) +#define TOE_DEV(so) (TOEPCB((so))->tp_toedev) +#define toeptoso(toep) ((toep)->tp_tp->t_inpcb->inp_socket) +#define sototoep(so) (sototcpcb((so))->t_toe) + +#define TRACE_ENTER printf("%s:%s entered\n", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited\n", __FUNCTION__, __FILE__, __LINE__) + +#define KTR_TOM KTR_SPARE2 +#define KTR_TCB KTR_SPARE3 + +struct toepcb; +struct listen_ctx; + +typedef void (*defer_handler_t)(struct toedev *dev, struct mbuf *m); + +void t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h); +void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev); +void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev); +int t3_push_frames(struct socket *so, int req_completion); +int t3_connect(struct toedev *tdev, struct socket *so, struct rtentry *rt, + struct sockaddr *nam); +void t3_init_listen_cpl_handlers(void); +int t3_init_cpl_io(void); +void t3_init_wr_tab(unsigned int wr_len); +uint32_t t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail); +void t3_send_rx_modulate(struct toepcb *toep); +void t3_cleanup_rbuf(struct tcpcb *tp, int copied); + +void t3_init_socket_ops(void); +void t3_install_socket_ops(struct socket *so); + + +void t3_disconnect_acceptq(struct socket *listen_so); +void t3_reset_synq(struct listen_ctx *ctx); +void t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler); + +struct toepcb *toepcb_alloc(void); +void toepcb_hold(struct toepcb *); +void toepcb_release(struct toepcb *); +void toepcb_init(struct toepcb *); + +void t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off); +void t3_set_dack_mss(struct toepcb *toep, int on); +void t3_set_keepalive(struct toepcb *toep, int on_off); +void t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag); +void t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset, + unsigned int len); +int t3_get_tcb(struct toepcb *toep); + +int t3_ctloutput(struct socket *so, struct sockopt *sopt); + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_l2t.c b/sys/dev/cxgb/ulp/tom/cxgb_l2t.c new file mode 100644 index 0000000000000..ab5fbe740114b --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.c @@ -0,0 +1,542 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#if __FreeBSD_version > 700000 +#include <sys/rwlock.h> +#endif + +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/if.h> +#include <net/ethernet.h> +#include <net/if_vlan_var.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/if_ether.h> + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + +#define VLAN_NONE 0xfff +#define SDL(s) ((struct sockaddr_dl *)s) +#define RT_ENADDR(sa) ((u_char *)LLADDR(SDL((sa)))) +#define rt_expire rt_rmx.rmx_expire + +struct llinfo_arp { + struct callout la_timer; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + u_short la_preempt; /* countdown for pre-expiry arps */ + u_short la_asked; /* # requests sent */ +}; + +/* + * Module locking notes: There is a RW lock protecting the L2 table as a + * whole plus a spinlock per L2T entry. Entry lookups and allocations happen + * under the protection of the table lock, individual entry changes happen + * while holding that entry's spinlock. The table lock nests outside the + * entry locks. Allocations of new entries take the table lock as writers so + * no other lookups can happen while allocating new entries. Entry updates + * take the table lock as readers so multiple entries can be updated in + * parallel. An L2T entry can be dropped by decrementing its reference count + * and therefore can happen in parallel with entry allocation but no entry + * can change state or increment its ref count during allocation as both of + * these perform lookups. + */ + +static inline unsigned int +vlan_prio(const struct l2t_entry *e) +{ + return e->vlan >> 13; +} + +static inline unsigned int +arp_hash(u32 key, int ifindex, const struct l2t_data *d) +{ + return jhash_2words(key, ifindex, 0) & (d->nentries - 1); +} + +static inline void +neigh_replace(struct l2t_entry *e, struct rtentry *rt) +{ + RT_LOCK(rt); + RT_ADDREF(rt); + RT_UNLOCK(rt); + + if (e->neigh) + RTFREE(e->neigh); + e->neigh = rt; +} + +/* + * Set up an L2T entry and send any packets waiting in the arp queue. The + * supplied mbuf is used for the CPL_L2T_WRITE_REQ. Must be called with the + * entry locked. 
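arp_hash() above mixes the next-hop IP address and the interface index with jhash_2words() and masks the result with nentries - 1, so the table size chosen in t3_init_l2t() must be a power of two for the mask to act as a modulus. A hedged sketch of the bucketing, using a simple stand-in mixer (mix32) invented for illustration rather than the real Jenkins hash:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative 32-bit mixer; NOT jhash_2words(), just something well spread. */
static uint32_t
mix32(uint32_t a, uint32_t b)
{
	uint32_t h = a * 0x9e3779b1u ^ b * 0x85ebca6bu;

	h ^= h >> 16;
	h *= 0x7feb352du;
	h ^= h >> 15;
	return h;
}

static unsigned int
arp_hash(uint32_t addr, int ifindex, unsigned int nentries)
{
	/* nentries must be a power of two for the mask to behave as a modulus. */
	assert((nentries & (nentries - 1)) == 0);
	return mix32(addr, (uint32_t)ifindex) & (nentries - 1);
}

int
main(void)
{
	unsigned int b1 = arp_hash(0x0a000001, 2, 256);	/* 10.0.0.1 on ifindex 2 */
	unsigned int b2 = arp_hash(0x0a000001, 3, 256);	/* same IP, other ifp */

	assert(b1 < 256 && b2 < 256);
	printf("buckets: %u %u\n", b1, b2);
	return (0);
}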
+ */ +static int +setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m, + struct l2t_entry *e) +{ + struct cpl_l2t_write_req *req; + + if (!m) { + if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + return (ENOMEM); + } + /* + * XXX MH_ALIGN + */ + req = mtod(m, struct cpl_l2t_write_req *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx)); + req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) | + V_L2T_W_VLAN(e->vlan & EVL_VLID_MASK) | + V_L2T_W_PRIO(vlan_prio(e))); + + memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); + m_set_priority(m, CPL_PRIORITY_CONTROL); + cxgb_ofld_send(dev, m); + while (e->arpq_head) { + m = e->arpq_head; + e->arpq_head = m->m_next; + m->m_next = NULL; + cxgb_ofld_send(dev, m); + } + e->arpq_tail = NULL; + e->state = L2T_STATE_VALID; + + return 0; +} + +/* + * Add a packet to the an L2T entry's queue of packets awaiting resolution. + * Must be called with the entry's lock held. + */ +static inline void +arpq_enqueue(struct l2t_entry *e, struct mbuf *m) +{ + m->m_next = NULL; + if (e->arpq_head) + e->arpq_tail->m_next = m; + else + e->arpq_head = m; + e->arpq_tail = m; +} + +int +t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e) +{ + struct rtentry *rt = e->neigh; + struct sockaddr_in sin; + + bzero(&sin, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr.s_addr = e->addr; + + CTR2(KTR_CXGB, "send slow on rt=%p eaddr=0x%08x\n", rt, e->addr); +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac); + mtx_lock(&e->lock); + if (e->state == L2T_STATE_STALE) + e->state = L2T_STATE_VALID; + mtx_unlock(&e->lock); + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return cxgb_ofld_send(dev, m); + case L2T_STATE_RESOLVING: + mtx_lock(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { // ARP already completed + mtx_unlock(&e->lock); + goto again; + } + arpq_enqueue(e, m); + mtx_unlock(&e->lock); + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the m_gethdr below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. 
+ */ + if (arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac) == 0) { + CTR6(KTR_CXGB, "mac=%x:%x:%x:%x:%x:%x\n", + e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); + + if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + return (ENOMEM); + + mtx_lock(&e->lock); + if (e->arpq_head) + setup_l2e_send_pending(dev, m, e); + else + m_freem(m); + mtx_unlock(&e->lock); + } + } + return 0; +} + +void +t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) +{ + struct rtentry *rt; + struct mbuf *m0; + struct sockaddr_in sin; + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr.s_addr = e->addr; + + if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) + return; + + rt = e->neigh; +again: + switch (e->state) { + case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ + arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac); + mtx_lock(&e->lock); + if (e->state == L2T_STATE_STALE) { + e->state = L2T_STATE_VALID; + } + mtx_unlock(&e->lock); + return; + case L2T_STATE_VALID: /* fast-path, send the packet on */ + return; + case L2T_STATE_RESOLVING: + mtx_lock(&e->lock); + if (e->state != L2T_STATE_RESOLVING) { // ARP already completed + mtx_unlock(&e->lock); + goto again; + } + mtx_unlock(&e->lock); + + /* + * Only the first packet added to the arpq should kick off + * resolution. However, because the alloc_skb below can fail, + * we allow each packet added to the arpq to retry resolution + * as a way of recovering from transient memory exhaustion. + * A better way would be to use a work request to retry L2T + * entries when there's no memory. + */ + arpresolve(rt->rt_ifp, rt, NULL, + (struct sockaddr *)&sin, e->dmac); + + } + return; +} +/* + * Allocate a free L2T entry. Must be called with l2t_data.lock held. + */ +static struct l2t_entry * +alloc_l2e(struct l2t_data *d) +{ + struct l2t_entry *end, *e, **p; + + if (!atomic_load_acq_int(&d->nfree)) + return NULL; + + /* there's definitely a free entry */ + for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e) + if (atomic_load_acq_int(&e->refcnt) == 0) + goto found; + + for (e = &d->l2tab[1]; atomic_load_acq_int(&e->refcnt); ++e) ; +found: + d->rover = e + 1; + atomic_add_int(&d->nfree, -1); + + /* + * The entry we found may be an inactive entry that is + * presently in the hash table. We need to remove it. + */ + if (e->state != L2T_STATE_UNUSED) { + int hash = arp_hash(e->addr, e->ifindex, d); + + for (p = &d->l2tab[hash].first; *p; p = &(*p)->next) + if (*p == e) { + *p = e->next; + break; + } + e->state = L2T_STATE_UNUSED; + } + + return e; +} + +/* + * Called when an L2T entry has no more users. The entry is left in the hash + * table since it is likely to be reused but we also bump nfree to indicate + * that the entry can be reallocated for a different neighbor. We also drop + * the existing neighbor reference in case the neighbor is going away and is + * waiting on our reference. + * + * Because entries can be reallocated to other neighbors once their ref count + * drops to 0 we need to take the entry's lock to avoid races with a new + * incarnation. 
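alloc_l2e() above may recycle an entry that still sits on a hash chain, and unlinks it with the pointer-to-pointer walk (p = &bucket.first; advance p to &(*p)->next), which removes a node from a singly linked list without tracking a separate predecessor. A self-contained sketch of just that idiom:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	int idx;
	struct node *next;
};

/* Unlink e from the chain rooted at *head using a pointer-to-pointer walk,
 * the same shape alloc_l2e() uses to pull a recycled entry out of its old
 * hash bucket. */
static void
chain_unlink(struct node **head, struct node *e)
{
	struct node **p;

	for (p = head; *p; p = &(*p)->next)
		if (*p == e) {
			*p = e->next;
			break;
		}
}

int
main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a;

	chain_unlink(&head, &b);		/* remove the middle node */
	assert(head == &a && a.next == &c);
	chain_unlink(&head, &a);		/* remove the head itself */
	assert(head == &c);
	printf("unlink ok\n");
	return (0);
}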
+ */ +void +t3_l2e_free(struct l2t_data *d, struct l2t_entry *e) +{ + struct rtentry *rt = NULL; + + mtx_lock(&e->lock); + if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */ + rt = e->neigh; + e->neigh = NULL; + } + + mtx_unlock(&e->lock); + atomic_add_int(&d->nfree, 1); + if (rt) + RTFREE(rt); +} + + +/* + * Update an L2T entry that was previously used for the same next hop as neigh. + * Must be called with softirqs disabled. + */ +static inline void +reuse_entry(struct l2t_entry *e, struct rtentry *neigh) +{ + struct llinfo_arp *la; + + la = (struct llinfo_arp *)neigh->rt_llinfo; + + mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ + if (neigh != e->neigh) + neigh_replace(e, neigh); + + if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), sizeof(e->dmac)) || + (neigh->rt_expire > time_uptime)) + e->state = L2T_STATE_RESOLVING; + else if (la->la_hold == NULL) + e->state = L2T_STATE_VALID; + else + e->state = L2T_STATE_STALE; + mtx_unlock(&e->lock); +} + +struct l2t_entry * +t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh, struct ifnet *ifp, + struct sockaddr *sa) +{ + struct l2t_entry *e; + struct l2t_data *d = L2DATA(dev); + u32 addr = ((struct sockaddr_in *)sa)->sin_addr.s_addr; + int ifidx = neigh->rt_ifp->if_index; + int hash = arp_hash(addr, ifidx, d); + unsigned int smt_idx = ((struct port_info *)ifp->if_softc)->port_id; + + rw_wlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx && + e->smt_idx == smt_idx) { + l2t_hold(d, e); + if (atomic_load_acq_int(&e->refcnt) == 1) + reuse_entry(e, neigh); + goto done; + } + + /* Need to allocate a new entry */ + e = alloc_l2e(d); + if (e) { + mtx_lock(&e->lock); /* avoid race with t3_l2t_free */ + e->next = d->l2tab[hash].first; + d->l2tab[hash].first = e; + rw_wunlock(&d->lock); + + e->state = L2T_STATE_RESOLVING; + e->addr = addr; + e->ifindex = ifidx; + e->smt_idx = smt_idx; + atomic_store_rel_int(&e->refcnt, 1); + e->neigh = NULL; + + + neigh_replace(e, neigh); +#ifdef notyet + /* + * XXX need to add accessor function for vlan tag + */ + if (neigh->rt_ifp->if_vlantrunk) + e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id; + else +#endif + e->vlan = VLAN_NONE; + mtx_unlock(&e->lock); + + return (e); + } + +done: + rw_wunlock(&d->lock); + return e; +} + +/* + * Called when address resolution fails for an L2T entry to handle packets + * on the arpq head. If a packet specifies a failure handler it is invoked, + * otherwise the packets is sent to the TOE. + * + * XXX: maybe we should abandon the latter behavior and just require a failure + * handler. 
+ */ +static void +handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq) +{ + + while (arpq) { + struct mbuf *m = arpq; +#ifdef notyet + struct l2t_mbuf_cb *cb = L2T_MBUF_CB(m); +#endif + arpq = m->m_next; + m->m_next = NULL; +#ifdef notyet + if (cb->arp_failure_handler) + cb->arp_failure_handler(dev, m); + else +#endif + cxgb_ofld_send(dev, m); + } + +} + +void +t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, + uint8_t *enaddr, struct sockaddr *sa) +{ + struct l2t_entry *e; + struct mbuf *arpq = NULL; + struct l2t_data *d = L2DATA(dev); + u32 addr = *(u32 *) &((struct sockaddr_in *)sa)->sin_addr; + int ifidx = neigh->rt_ifp->if_index; + int hash = arp_hash(addr, ifidx, d); + struct llinfo_arp *la; + + rw_rlock(&d->lock); + for (e = d->l2tab[hash].first; e; e = e->next) + if (e->addr == addr && e->ifindex == ifidx) { + mtx_lock(&e->lock); + goto found; + } + rw_runlock(&d->lock); + CTR1(KTR_CXGB, "t3_l2t_update: addr=0x%08x not found", addr); + return; + +found: + printf("found 0x%08x\n", addr); + + rw_runlock(&d->lock); + memcpy(e->dmac, enaddr, ETHER_ADDR_LEN); + printf("mac=%x:%x:%x:%x:%x:%x\n", + e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); + + if (atomic_load_acq_int(&e->refcnt)) { + if (neigh != e->neigh) + neigh_replace(e, neigh); + + la = (struct llinfo_arp *)neigh->rt_llinfo; + if (e->state == L2T_STATE_RESOLVING) { + + if (la->la_asked >= 5 /* arp_maxtries */) { + arpq = e->arpq_head; + e->arpq_head = e->arpq_tail = NULL; + } else + setup_l2e_send_pending(dev, NULL, e); + } else { + e->state = L2T_STATE_VALID; + if (memcmp(e->dmac, RT_ENADDR(neigh->rt_gateway), 6)) + setup_l2e_send_pending(dev, NULL, e); + } + } + mtx_unlock(&e->lock); + + if (arpq) + handle_failed_resolution(dev, arpq); +} + +struct l2t_data * +t3_init_l2t(unsigned int l2t_capacity) +{ + struct l2t_data *d; + int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry); + + d = cxgb_alloc_mem(size); + if (!d) + return NULL; + + d->nentries = l2t_capacity; + d->rover = &d->l2tab[1]; /* entry 0 is not used */ + atomic_store_rel_int(&d->nfree, l2t_capacity - 1); + rw_init(&d->lock, "L2T"); + + for (i = 0; i < l2t_capacity; ++i) { + d->l2tab[i].idx = i; + d->l2tab[i].state = L2T_STATE_UNUSED; + mtx_init(&d->l2tab[i].lock, "L2TAB", NULL, MTX_DEF); + atomic_store_rel_int(&d->l2tab[i].refcnt, 0); + } + return d; +} + +void +t3_free_l2t(struct l2t_data *d) +{ + int i; + + rw_destroy(&d->lock); + for (i = 0; i < d->nentries; ++i) + mtx_destroy(&d->l2tab[i].lock); + + cxgb_free_mem(d); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_l2t.h b/sys/dev/cxgb/ulp/tom/cxgb_l2t.h new file mode 100644 index 0000000000000..3575f6fa98b14 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_l2t.h @@ -0,0 +1,161 @@ +/************************************************************************** + +Copyright (c) 2007-2008, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ +#ifndef _CHELSIO_L2T_H +#define _CHELSIO_L2T_H + +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <sys/lock.h> + +#if __FreeBSD_version > 700000 +#include <sys/rwlock.h> +#else +#define rwlock mtx +#define rw_wlock(x) mtx_lock((x)) +#define rw_wunlock(x) mtx_unlock((x)) +#define rw_rlock(x) mtx_lock((x)) +#define rw_runlock(x) mtx_unlock((x)) +#define rw_init(x, str) mtx_init((x), (str), NULL, MTX_DEF) +#define rw_destroy(x) mtx_destroy((x)) +#endif + +enum { + L2T_STATE_VALID, /* entry is up to date */ + L2T_STATE_STALE, /* entry may be used but needs revalidation */ + L2T_STATE_RESOLVING, /* entry needs address resolution */ + L2T_STATE_UNUSED /* entry not in use */ +}; + +/* + * Each L2T entry plays multiple roles. First of all, it keeps state for the + * corresponding entry of the HW L2 table and maintains a queue of offload + * packets awaiting address resolution. Second, it is a node of a hash table + * chain, where the nodes of the chain are linked together through their next + * pointer. Finally, each node is a bucket of a hash table, pointing to the + * first element in its chain through its first pointer. + */ +struct l2t_entry { + uint16_t state; /* entry state */ + uint16_t idx; /* entry index */ + uint32_t addr; /* dest IP address */ + int ifindex; /* neighbor's net_device's ifindex */ + uint16_t smt_idx; /* SMT index */ + uint16_t vlan; /* VLAN TCI (id: bits 0-11, prio: 13-15 */ + struct rtentry *neigh; /* associated neighbour */ + struct l2t_entry *first; /* start of hash chain */ + struct l2t_entry *next; /* next l2t_entry on chain */ + struct mbuf *arpq_head; /* queue of packets awaiting resolution */ + struct mbuf *arpq_tail; + struct mtx lock; + volatile uint32_t refcnt; /* entry reference count */ + uint8_t dmac[6]; /* neighbour's MAC address */ +}; + +struct l2t_data { + unsigned int nentries; /* number of entries */ + struct l2t_entry *rover; /* starting point for next allocation */ + volatile uint32_t nfree; /* number of free entries */ + struct rwlock lock; + struct l2t_entry l2tab[0]; +}; + +typedef void (*arp_failure_handler_func)(struct t3cdev *dev, + struct mbuf *m); + +typedef void (*opaque_arp_failure_handler_func)(void *dev, + struct mbuf *m); + +/* + * Callback stored in an skb to handle address resolution failure. + */ +struct l2t_mbuf_cb { + arp_failure_handler_func arp_failure_handler; +}; + +/* + * XXX + */ +#define L2T_MBUF_CB(skb) ((struct l2t_mbuf_cb *)(skb)->cb) + + +static __inline void set_arp_failure_handler(struct mbuf *m, + arp_failure_handler_func hnd) +{ + m->m_pkthdr.header = (opaque_arp_failure_handler_func)hnd; + +} + +/* + * Getting to the L2 data from an offload device. 
+ */ +#define L2DATA(dev) ((dev)->l2opt) + +void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e); +void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa); +struct l2t_entry *t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh, + struct ifnet *ifp, struct sockaddr *sa); +int t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, + struct l2t_entry *e); +void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e); +struct l2t_data *t3_init_l2t(unsigned int l2t_capacity); +void t3_free_l2t(struct l2t_data *d); + +#ifdef CONFIG_PROC_FS +int t3_l2t_proc_setup(struct proc_dir_entry *dir, struct l2t_data *d); +void t3_l2t_proc_free(struct proc_dir_entry *dir); +#else +#define l2t_proc_setup(dir, d) 0 +#define l2t_proc_free(dir) +#endif + +int cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m); + +static inline int l2t_send(struct t3cdev *dev, struct mbuf *m, + struct l2t_entry *e) +{ + if (__predict_true(e->state == L2T_STATE_VALID)) { + return cxgb_ofld_send(dev, (struct mbuf *)m); + } + return t3_l2t_send_slow(dev, (struct mbuf *)m, e); +} + +static inline void l2t_release(struct l2t_data *d, struct l2t_entry *e) +{ + if (atomic_fetchadd_int(&e->refcnt, -1) == 1) + t3_l2e_free(d, e); +} + +static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e) +{ + if (atomic_fetchadd_int(&e->refcnt, 1) == 1) /* 0 -> 1 transition */ + atomic_add_int(&d->nfree, 1); +} + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c new file mode 100644 index 0000000000000..1d15cf292dcd3 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -0,0 +1,338 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
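The l2t_release()/l2t_hold() inlines above rely on atomic_fetchadd_int() returning the counter's previous value: a release that sees 1 is the 1 -> 0 transition that must free the entry. The C11 sketch below uses stdatomic in place of FreeBSD's atomic(9) interface, an assumption made only for portability, to demonstrate that convention.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

struct entry {
	atomic_int refcnt;
	int freed;
};

static void
entry_free(struct entry *e)
{
	e->freed = 1;	/* stands in for t3_l2e_free() */
}

static void
entry_release(struct entry *e)
{
	/* fetch_add returns the old value: old == 1 means we dropped it to 0. */
	if (atomic_fetch_add(&e->refcnt, -1) == 1)
		entry_free(e);
}

int
main(void)
{
	struct entry e;

	atomic_init(&e.refcnt, 2);
	e.freed = 0;

	entry_release(&e);		/* 2 -> 1, still referenced */
	assert(!e.freed);
	entry_release(&e);		/* 1 -> 0, last reference */
	assert(e.freed);
	printf("refcount convention ok\n");
	return (0);
}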
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> + +#include <netinet/tcp_offload.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> + + +static struct listen_info *listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid); +static int listen_hash_del(struct tom_data *d, struct socket *so); + +/* + * Process a CPL_CLOSE_LISTSRV_RPL message. If the status is good we release + * the STID. + */ +static int +do_close_server_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_close_listserv_rpl *rpl = cplhdr(m); + unsigned int stid = GET_TID(rpl); + + if (rpl->status != CPL_ERR_NONE) + log(LOG_ERR, "Unexpected CLOSE_LISTSRV_RPL status %u for " + "STID %u\n", rpl->status, stid); + else { + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + + cxgb_free_stid(cdev, stid); + free(listen_ctx, M_CXGB); + } + + return (CPL_RET_BUF_DONE); +} + +/* + * Process a CPL_PASS_OPEN_RPL message. Remove the socket from the listen hash + * table and free the STID if there was any error, otherwise nothing to do. + */ +static int +do_pass_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct cpl_pass_open_rpl *rpl = cplhdr(m); + + if (rpl->status != CPL_ERR_NONE) { + int stid = GET_TID(rpl); + struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; + struct tom_data *d = listen_ctx->tom_data; + struct socket *lso = listen_ctx->lso; + +#if VALIDATE_TID + if (!lso) + return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE); +#endif + /* + * Note: It is safe to unconditionally call listen_hash_del() + * at this point without risking unhashing a reincarnation of + * an already closed socket (i.e., there is no listen, close, + * listen, free the sock for the second listen while processing + * a message for the first race) because we are still holding + * a reference on the socket. It is possible that the unhash + * will fail because the socket is already closed, but we can't + * unhash the wrong socket because it is impossible for the + * socket to which this message refers to have reincarnated. 
+ */ + listen_hash_del(d, lso); + cxgb_free_stid(cdev, stid); +#ifdef notyet + /* + * XXX need to unreference the inpcb + * but we have no way of knowing that other TOMs aren't referencing it + */ + sock_put(lso); +#endif + free(listen_ctx, M_CXGB); + } + return CPL_RET_BUF_DONE; +} + +void +t3_init_listen_cpl_handlers(void) +{ + t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); + t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); +} + +static inline int +listen_hashfn(const struct socket *so) +{ + return ((unsigned long)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1); +} + +/* + * Create and add a listen_info entry to the listen hash table. This and the + * listen hash table functions below cannot be called from softirqs. + */ +static struct listen_info * +listen_hash_add(struct tom_data *d, struct socket *so, unsigned int stid) +{ + struct listen_info *p; + + p = malloc(sizeof(*p), M_CXGB, M_NOWAIT|M_ZERO); + if (p) { + int bucket = listen_hashfn(so); + + p->so = so; /* just a key, no need to take a reference */ + p->stid = stid; + mtx_lock(&d->listen_lock); + p->next = d->listen_hash_tab[bucket]; + d->listen_hash_tab[bucket] = p; + mtx_unlock(&d->listen_lock); + } + return p; +} + +/* + * Given a pointer to a listening socket return its server TID by consulting + * the socket->stid map. Returns -1 if the socket is not in the map. + */ +static int +listen_hash_find(struct tom_data *d, struct socket *so) +{ + int stid = -1, bucket = listen_hashfn(so); + struct listen_info *p; + + mtx_lock(&d->listen_lock); + for (p = d->listen_hash_tab[bucket]; p; p = p->next) + if (p->so == so) { + stid = p->stid; + break; + } + mtx_unlock(&d->listen_lock); + return stid; +} + +/* + * Delete the listen_info structure for a listening socket. Returns the server + * TID for the socket if it is present in the socket->stid map, or -1. + */ +static int +listen_hash_del(struct tom_data *d, struct socket *so) +{ + int bucket, stid = -1; + struct listen_info *p, **prev; + + bucket = listen_hashfn(so); + prev = &d->listen_hash_tab[bucket]; + + mtx_lock(&d->listen_lock); + for (p = *prev; p; prev = &p->next, p = p->next) + if (p->so == so) { + stid = p->stid; + *prev = p->next; + free(p, M_CXGB); + break; + } + mtx_unlock(&d->listen_lock); + + return (stid); +} + +/* + * Start a listening server by sending a passive open request to HW. + */ +void +t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev) +{ + int stid; + struct mbuf *m; + struct cpl_pass_open_req *req; + struct tom_data *d = TOM_DATA(dev); + struct inpcb *inp = sotoinpcb(so); + struct listen_ctx *ctx; + + if (!TOM_TUNABLE(dev, activated)) + return; + + if (listen_hash_find(d, so) != -1) + return; + + CTR1(KTR_TOM, "start listen on port %u", ntohs(inp->inp_lport)); + ctx = malloc(sizeof(*ctx), M_CXGB, M_NOWAIT|M_ZERO); + + if (!ctx) + return; + + ctx->tom_data = d; + ctx->lso = so; + ctx->ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) ? 
ULP_MODE_TCPDDP : 0; + LIST_INIT(&ctx->synq_head); + + stid = cxgb_alloc_stid(d->cdev, d->client, ctx); + if (stid < 0) + goto free_ctx; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) + goto free_stid; + m->m_pkthdr.len = m->m_len = sizeof(*req); + + if (!listen_hash_add(d, so, stid)) + goto free_all; + + req = mtod(m, struct cpl_pass_open_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid)); + req->local_port = inp->inp_lport; + memcpy(&req->local_ip, &inp->inp_laddr, 4); + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(16)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + m_set_priority(m, CPL_PRIORITY_LISTEN); + cxgb_ofld_send(cdev, m); + return; + +free_all: + m_free(m); +free_stid: + cxgb_free_stid(cdev, stid); +#if 0 + sock_put(sk); +#endif +free_ctx: + free(ctx, M_CXGB); +} + +/* + * Stop a listening server by sending a close_listsvr request to HW. + * The server TID is freed when we get the reply. + */ +void +t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev) +{ + struct mbuf *m; + struct cpl_close_listserv_req *req; + struct listen_ctx *lctx; + int stid = listen_hash_del(TOM_DATA(dev), so); + + if (stid < 0) + return; + + lctx = cxgb_get_lctx(cdev, stid); + /* + * Do this early so embryonic connections are marked as being aborted + * while the stid is still open. This ensures pass_establish messages + * that arrive while we are closing the server will be able to locate + * the listening socket. + */ + t3_reset_synq(lctx); + + /* Send the close ASAP to stop further passive opens */ + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + /* + * XXX allocate from lowmem cache + */ + } + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req = mtod(m, struct cpl_close_listserv_req *); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid)); + req->cpu_idx = 0; + m_set_priority(m, CPL_PRIORITY_LISTEN); + cxgb_ofld_send(cdev, m); + + t3_disconnect_acceptq(so); +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h new file mode 100644 index 0000000000000..2cbfa7b38b28f --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h @@ -0,0 +1,181 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
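listen_hashfn() above keys the listen hash purely on the listening socket's pointer, discarding the low 10 bits (which carry little entropy because sockets come from the same allocator) and masking with LISTEN_INFO_HASH_SIZE - 1. The sketch below assumes a hypothetical 32-bucket table; the real LISTEN_INFO_HASH_SIZE lives in cxgb_tom.h and is not shown in this diff.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LISTEN_INFO_HASH_SIZE	32	/* assumed power-of-two bucket count */

static int
listen_hashfn(const void *so)
{
	return ((uintptr_t)so >> 10) & (LISTEN_INFO_HASH_SIZE - 1);
}

int
main(void)
{
	/* The bucket is stable for a given pointer, which is all that the
	 * add/find/del routines above need from the hash. */
	char pool[2048];
	int b0 = listen_hashfn(&pool[0]);
	int b1 = listen_hashfn(&pool[1024]);

	assert(b0 >= 0 && b0 < LISTEN_INFO_HASH_SIZE);
	assert(listen_hashfn(&pool[0]) == b0);
	printf("buckets: %d %d\n", b0, b1);
	return (0);
}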
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +$FreeBSD$ + +***************************************************************************/ + +#ifndef T3_DDP_H +#define T3_DDP_H + +/* Should be 1 or 2 indicating single or double kernel buffers. */ +#define NUM_DDP_KBUF 2 + +/* min receive window for a connection to be considered for DDP */ +#define MIN_DDP_RCV_WIN (48 << 10) + +/* amount of Rx window not available to DDP to avoid window exhaustion */ +#define DDP_RSVD_WIN (16 << 10) + +/* # of sentinel invalid page pods at the end of a group of valid page pods */ +#define NUM_SENTINEL_PPODS 0 + +/* # of pages a pagepod can hold without needing another pagepod */ +#define PPOD_PAGES 4 + +/* page pods are allocated in groups of this size (must be power of 2) */ +#define PPOD_CLUSTER_SIZE 16 + +/* for each TID we reserve this many page pods up front */ +#define RSVD_PPODS_PER_TID 1 + +struct pagepod { + uint32_t pp_vld_tid; + uint32_t pp_pgsz_tag_color; + uint32_t pp_max_offset; + uint32_t pp_page_offset; + uint64_t pp_rsvd; + uint64_t pp_addr[5]; +}; + +#define PPOD_SIZE sizeof(struct pagepod) + +#define S_PPOD_TID 0 +#define M_PPOD_TID 0xFFFFFF +#define V_PPOD_TID(x) ((x) << S_PPOD_TID) + +#define S_PPOD_VALID 24 +#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID) +#define F_PPOD_VALID V_PPOD_VALID(1U) + +#define S_PPOD_COLOR 0 +#define M_PPOD_COLOR 0x3F +#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR) + +#define S_PPOD_TAG 6 +#define M_PPOD_TAG 0xFFFFFF +#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG) + +#define S_PPOD_PGSZ 30 +#define M_PPOD_PGSZ 0x3 +#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ) + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <machine/bus.h> + +/* DDP gather lists can specify an offset only for the first page. 
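+ * The list describes dgl_length bytes spread across dgl_nelem physical pages:
+ * dgl_offset is the byte offset into dgl_pages[0], and every subsequent page
+ * is used from offset 0.  For illustration only (assuming 4KB pages), a
+ * 10000-byte list starting 512 bytes into its first page would use
+ * dgl_pages[0] bytes 512..4095, dgl_pages[1] bytes 0..4095, and
+ * dgl_pages[2] bytes 0..2319.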
*/ +struct ddp_gather_list { + unsigned int dgl_length; + unsigned int dgl_offset; + unsigned int dgl_nelem; + vm_page_t dgl_pages[0]; +}; + +struct ddp_buf_state { + unsigned int cur_offset; /* offset of latest DDP notification */ + unsigned int flags; + struct ddp_gather_list *gl; +}; + +struct ddp_state { + struct ddp_buf_state buf_state[2]; /* per buffer state */ + int cur_buf; + unsigned short kbuf_noinval; + unsigned short kbuf_idx; /* which HW buffer is used for kbuf */ + struct ddp_gather_list *ubuf; + int user_ddp_pending; + unsigned int ubuf_nppods; /* # of page pods for buffer 1 */ + unsigned int ubuf_tag; + unsigned int ubuf_ddp_ready; + int cancel_ubuf; + int get_tcb_count; + unsigned int kbuf_posted; + unsigned int kbuf_nppods[NUM_DDP_KBUF]; + unsigned int kbuf_tag[NUM_DDP_KBUF]; + struct ddp_gather_list *kbuf[NUM_DDP_KBUF]; /* kernel buffer for DDP prefetch */ +}; + +/* buf_state flags */ +enum { + DDP_BF_NOINVAL = 1 << 0, /* buffer is set to NO_INVALIDATE */ + DDP_BF_NOCOPY = 1 << 1, /* DDP to final dest, no copy needed */ + DDP_BF_NOFLIP = 1 << 2, /* buffer flips after GET_TCB_RPL */ + DDP_BF_PSH = 1 << 3, /* set in skb->flags if the a DDP was + completed with a segment having the + PSH flag set */ + DDP_BF_NODATA = 1 << 4, /* buffer completed before filling */ +}; + +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +struct sockbuf; + +/* + * Returns 1 if a UBUF DMA buffer might be active. + */ +static inline int +t3_ddp_ubuf_pending(struct toepcb *toep) +{ + struct ddp_state *p = &toep->tp_ddp_state; + + /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP, + * but DDP_STATE() is only valid if the connection actually enabled + * DDP. + */ + if (p->kbuf[0] == NULL) + return (0); + + return (p->buf_state[0].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)) || + (p->buf_state[1].flags & (DDP_BF_NOFLIP | DDP_BF_NOCOPY)); +} + +int t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color); +int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag); +void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); +void t3_free_ddp_gl(struct ddp_gather_list *gl); +int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len); +//void t3_repost_kbuf(struct socket *so, int modulate, int activate); +void t3_post_kbuf(struct toepcb *toep, int modulate, int nonblock); +int t3_post_ubuf(struct toepcb *toep, const struct uio *uio, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +void t3_cancel_ubuf(struct toepcb *toep, struct sockbuf *rcv); +int t3_overlay_ubuf(struct toepcb *toep, struct sockbuf *rcv, + const struct uio *uio, int nonblock, + int rcv_flags, int modulate, int post_kbuf); +int t3_enter_ddp(struct toepcb *toep, unsigned int kbuf_size, unsigned int waitall, int nonblock); +void t3_cleanup_ddp(struct toepcb *toep); +void t3_release_ddp_resources(struct toepcb *toep); +void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx); +void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0, + unsigned int tag1, unsigned int len); +void t3_setup_ddpbufs(struct toepcb *, unsigned int len0, unsigned int offset0, + unsigned int len1, unsigned int offset1, + uint64_t ddp_flags, uint64_t flag_mask, int modulate); +#endif /* T3_DDP_H */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h new file mode 100644 index 0000000000000..3042ef00b0f1b --- /dev/null +++ 
b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h @@ -0,0 +1,47 @@ + +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef CXGB_TCP_H_ +#define CXGB_TCP_H_ +#ifdef TCP_USRREQS_OVERLOAD +struct tcpcb *cxgb_tcp_drop(struct tcpcb *tp, int errno); +#else +#define cxgb_tcp_drop tcp_drop +#endif +void cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip); +struct tcpcb *cxgb_tcp_close(struct tcpcb *tp); + +extern struct pr_usrreqs cxgb_tcp_usrreqs; +#ifdef INET6 +extern struct pr_usrreqs cxgb_tcp6_usrreqs; +#endif + +#include <sys/sysctl.h> +SYSCTL_DECL(_net_inet_tcp_cxgb); +#endif /* CXGB_TCP_H_ */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c new file mode 100644 index 0000000000000..b61e1aca2c9ea --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.c @@ -0,0 +1,95 @@ +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * grab bag of accessor routines that will either be moved to netinet + * or removed + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_var.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_offload.h> +#include <netinet/tcp_syncache.h> +#include <netinet/toedev.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> + + +/* + * This file contains code as a short-term staging area before it is moved in + * to sys/netinet/tcp_offload.c + */ + +void +sockbuf_lock(struct sockbuf *sb) +{ + + SOCKBUF_LOCK(sb); +} + +void +sockbuf_lock_assert(struct sockbuf *sb) +{ + + SOCKBUF_LOCK_ASSERT(sb); +} + +void +sockbuf_unlock(struct sockbuf *sb) +{ + + SOCKBUF_UNLOCK(sb); +} + +int +sockbuf_sbspace(struct sockbuf *sb) +{ + + return (sbspace(sb)); +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h new file mode 100644 index 0000000000000..bf0568c5e7c94 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_offload.h @@ -0,0 +1,155 @@ +/* $FreeBSD$ */ + +#ifndef CXGB_TCP_OFFLOAD_H_ +#define CXGB_TCP_OFFLOAD_H_ + +struct socket; +struct sockbuf; + +void sockbuf_lock(struct sockbuf *); +void sockbuf_lock_assert(struct sockbuf *); +void sockbuf_unlock(struct sockbuf *); +int sockbuf_sbspace(struct sockbuf *); + + +#ifndef _SYS_SOCKETVAR_H_ +#include <sys/selinfo.h> +#include <sys/sx.h> + +/* + * Constants for sb_flags field of struct sockbuf. + */ +#define SB_MAX (256*1024) /* default for max chars in sockbuf */ +/* + * Constants for sb_flags field of struct sockbuf. 
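+ * These mirror the definitions normally provided by <sys/socketvar.h>; they
+ * are only compiled in here when that header has not already been included
+ * (see the #ifndef _SYS_SOCKETVAR_H_ guard above).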
+ */ +#define SB_WAIT 0x04 /* someone is waiting for data/space */ +#define SB_SEL 0x08 /* someone is selecting */ +#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ +#define SB_UPCALL 0x20 /* someone wants an upcall */ +#define SB_NOINTR 0x40 /* operations not interruptible */ +#define SB_AIO 0x80 /* AIO operations queued */ +#define SB_KNOTE 0x100 /* kernel note attached */ +#define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */ +#define SB_IN_TOE 0x400 /* socket buffer is in the middle of an operation */ +#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ + + +struct sockbuf { + struct selinfo sb_sel; /* process selecting read/write */ + struct mtx sb_mtx; /* sockbuf lock */ + struct sx sb_sx; /* prevent I/O interlacing */ + short sb_state; /* (c/d) socket state on sockbuf */ +#define sb_startzero sb_mb + struct mbuf *sb_mb; /* (c/d) the mbuf chain */ + struct mbuf *sb_mbtail; /* (c/d) the last mbuf in the chain */ + struct mbuf *sb_lastrecord; /* (c/d) first mbuf of last + * record in socket buffer */ + struct mbuf *sb_sndptr; /* (c/d) pointer into mbuf chain */ + u_int sb_sndptroff; /* (c/d) byte offset of ptr into chain */ + u_int sb_cc; /* (c/d) actual chars in buffer */ + u_int sb_hiwat; /* (c/d) max actual char count */ + u_int sb_mbcnt; /* (c/d) chars of mbufs used */ + u_int sb_mbmax; /* (c/d) max chars of mbufs to use */ + u_int sb_ctl; /* (c/d) non-data chars in buffer */ + int sb_lowat; /* (c/d) low water mark */ + int sb_timeo; /* (c/d) timeout for read/write */ + short sb_flags; /* (c/d) flags, see below */ +}; + +void sbappend(struct sockbuf *sb, struct mbuf *m); +void sbappend_locked(struct sockbuf *sb, struct mbuf *m); +void sbappendstream(struct sockbuf *sb, struct mbuf *m); +void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m); +void sbdrop(struct sockbuf *sb, int len); +void sbdrop_locked(struct sockbuf *sb, int len); +void sbdroprecord(struct sockbuf *sb); +void sbdroprecord_locked(struct sockbuf *sb); +void sbflush(struct sockbuf *sb); +void sbflush_locked(struct sockbuf *sb); +int sbwait(struct sockbuf *sb); +int sblock(struct sockbuf *, int); +void sbunlock(struct sockbuf *); + + + +/* adjust counters in sb reflecting allocation of m */ +#define sballoc(sb, m) { \ + (sb)->sb_cc += (m)->m_len; \ + if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \ + (sb)->sb_ctl += (m)->m_len; \ + (sb)->sb_mbcnt += MSIZE; \ + if ((m)->m_flags & M_EXT) \ + (sb)->sb_mbcnt += (m)->m_ext.ext_size; \ +} + +/* adjust counters in sb reflecting freeing of m */ +#define sbfree(sb, m) { \ + (sb)->sb_cc -= (m)->m_len; \ + if ((m)->m_type != MT_DATA && (m)->m_type != MT_OOBDATA) \ + (sb)->sb_ctl -= (m)->m_len; \ + (sb)->sb_mbcnt -= MSIZE; \ + if ((m)->m_flags & M_EXT) \ + (sb)->sb_mbcnt -= (m)->m_ext.ext_size; \ + if ((sb)->sb_sndptr == (m)) { \ + (sb)->sb_sndptr = NULL; \ + (sb)->sb_sndptroff = 0; \ + } \ + if ((sb)->sb_sndptroff != 0) \ + (sb)->sb_sndptroff -= (m)->m_len; \ +} + +#define SS_NOFDREF 0x0001 /* no file table ref any more */ +#define SS_ISCONNECTED 0x0002 /* socket connected to a peer */ +#define SS_ISCONNECTING 0x0004 /* in process of connecting to peer */ +#define SS_ISDISCONNECTING 0x0008 /* in process of disconnecting */ +#define SS_NBIO 0x0100 /* non-blocking ops */ +#define SS_ASYNC 0x0200 /* async i/o notify */ +#define SS_ISCONFIRMING 0x0400 /* deciding to accept connection req */ +#define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ +/* + * Protocols can mark a socket as SS_PROTOREF to indicate 
that, following + * pru_detach, they still want the socket to persist, and will free it + * themselves when they are done. Protocols should only ever call sofree() + * following setting this flag in pru_detach(), and never otherwise, as + * sofree() bypasses socket reference counting. + */ +#define SS_PROTOREF 0x4000 /* strong protocol reference */ + +/* + * Socket state bits now stored in the socket buffer state field. + */ +#define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ +#define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ +#define SBS_RCVATMARK 0x0040 /* at mark on input */ + + + +enum sopt_dir { SOPT_GET, SOPT_SET }; +struct sockopt { + enum sopt_dir sopt_dir; /* is this a get or a set? */ + int sopt_level; /* second arg of [gs]etsockopt */ + int sopt_name; /* third arg of [gs]etsockopt */ + void *sopt_val; /* fourth arg of [gs]etsockopt */ + size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ + struct thread *sopt_td; /* calling thread or null if kernel */ +}; + + +int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); +int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); + + +void soisconnected(struct socket *so); +void soisconnecting(struct socket *so); +void soisdisconnected(struct socket *so); +void soisdisconnecting(struct socket *so); +void socantrcvmore(struct socket *so); +void socantrcvmore_locked(struct socket *so); +void socantsendmore(struct socket *so); +void socantsendmore_locked(struct socket *so); + +#endif /* !NET_CORE */ + + +#endif /* CXGB_TCP_OFFLOAD_H_ */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h new file mode 100644 index 0000000000000..7c4bd0c06c414 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 2007-2008, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ +#ifndef CXGB_TOEPCB_H_ +#define CXGB_TOEPCB_H_ +#include <sys/bus.h> +#include <sys/condvar.h> +#include <dev/cxgb/sys/mbufq.h> + +struct toepcb { + struct toedev *tp_toedev; + struct l2t_entry *tp_l2t; + unsigned int tp_tid; + int tp_wr_max; + int tp_wr_avail; + int tp_wr_unacked; + int tp_delack_mode; + int tp_mtu_idx; + int tp_ulp_mode; + int tp_qset_idx; + int tp_mss_clamp; + int tp_qset; + int tp_flags; + int tp_enqueued_bytes; + int tp_page_count; + int tp_state; + + tcp_seq tp_iss; + tcp_seq tp_delack_seq; + tcp_seq tp_rcv_wup; + tcp_seq tp_copied_seq; + uint64_t tp_write_seq; + + volatile int tp_refcount; + vm_page_t *tp_pages; + + struct tcpcb *tp_tp; + struct mbuf *tp_m_last; + bus_dma_tag_t tp_tx_dmat; + bus_dma_tag_t tp_rx_dmat; + bus_dmamap_t tp_dmamap; + + LIST_ENTRY(toepcb) synq_entry; + struct mbuf_head wr_list; + struct mbuf_head out_of_order_queue; + struct ddp_state tp_ddp_state; + struct cv tp_cv; + +}; + +static inline void +reset_wr_list(struct toepcb *toep) +{ + + mbufq_init(&toep->wr_list); +} + +static inline void +purge_wr_queue(struct toepcb *toep) +{ + struct mbuf *m; + + while ((m = mbufq_dequeue(&toep->wr_list)) != NULL) + m_freem(m); +} + +static inline void +enqueue_wr(struct toepcb *toep, struct mbuf *m) +{ + + mbufq_tail(&toep->wr_list, m); +} + +static inline struct mbuf * +peek_wr(const struct toepcb *toep) +{ + + return (mbufq_peek(&toep->wr_list)); +} + +static inline struct mbuf * +dequeue_wr(struct toepcb *toep) +{ + + return (mbufq_dequeue(&toep->wr_list)); +} + +#define wr_queue_walk(toep, m) \ + for (m = peek_wr(toep); m; m = m->m_nextpkt) + + + +#endif + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c new file mode 100644 index 0000000000000..751b1cd0b051e --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c @@ -0,0 +1,1510 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/ktr.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/eventhandler.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/taskqueue.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/in_pcb.h> + +#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_offload.h> +#include <netinet/tcp_fsm.h> + +#ifdef CONFIG_DEFINED +#include <cxgb_include.h> +#else +#include <dev/cxgb/cxgb_include.h> +#endif + +#include <net/if_vlan_var.h> +#include <net/route.h> + + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> +#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> +#include <dev/cxgb/ulp/tom/cxgb_tcp.h> + + + + + +static int activated = 1; +TUNABLE_INT("hw.t3toe.activated", &activated); +SYSCTL_NODE(_hw, OID_AUTO, t3toe, CTLFLAG_RD, 0, "T3 toe driver parameters"); +SYSCTL_UINT(_hw_t3toe, OID_AUTO, activated, CTLFLAG_RDTUN, &activated, 0, + "enable TOE at init time"); + + +TAILQ_HEAD(, adapter) adapter_list; +static struct rwlock adapter_list_lock; + +static TAILQ_HEAD(, tom_data) cxgb_list; +static struct mtx cxgb_list_lock; +static const unsigned int MAX_ATIDS = 64 * 1024; +static const unsigned int ATID_BASE = 0x100000; + +static int t3_toe_attach(struct toedev *dev, const struct offload_id *entry); +static void cxgb_register_listeners(void); +static void t3c_tom_add(struct t3cdev *cdev); + +/* + * Handlers for each CPL opcode + */ +static cxgb_cpl_handler_func tom_cpl_handlers[256]; + + +static eventhandler_tag listen_tag; + +static struct offload_id t3_toe_id_tab[] = { + { TOE_ID_CHELSIO_T3, 0 }, + { TOE_ID_CHELSIO_T3B, 0 }, + { TOE_ID_CHELSIO_T3C, 0 }, + { 0 } +}; + +static struct tom_info t3_tom_info = { + .ti_attach = t3_toe_attach, + .ti_id_table = t3_toe_id_tab, + .ti_name = "Chelsio-T3" +}; + +struct cxgb_client t3c_tom_client = { + .name = "tom_cxgb3", + .add = t3c_tom_add, + .remove = NULL, + .handlers = tom_cpl_handlers, + .redirect = NULL +}; + +/* + * Add an skb to the deferred skb queue for processing from process context. 
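+ * In this FreeBSD port the deferred "skb" is an mbuf: t3_defer_reply() tags
+ * the reply with a handler via m_set_handler(), appends it to tom_data->deferq
+ * and, when the queue becomes non-empty, schedules deferq_task on the TOM
+ * taskqueue so the handler runs later in a sleepable context.  A hypothetical
+ * caller (my_deferred_handler is illustrative only):
+ *
+ *	t3_defer_reply(m, dev, my_deferred_handler);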
+ */ +void +t3_defer_reply(struct mbuf *m, struct toedev *dev, defer_handler_t handler) +{ + struct tom_data *td = TOM_DATA(dev); + + m_set_handler(m, handler); + mtx_lock(&td->deferq.lock); + + mbufq_tail(&td->deferq, m); + if (mbufq_len(&td->deferq) == 1) + taskqueue_enqueue(td->tq, &td->deferq_task); + mtx_lock(&td->deferq.lock); +} + +struct toepcb * +toepcb_alloc(void) +{ + struct toepcb *toep; + + toep = malloc(sizeof(struct toepcb), M_CXGB, M_NOWAIT|M_ZERO); + + if (toep == NULL) + return (NULL); + + toepcb_init(toep); + return (toep); +} + +void +toepcb_init(struct toepcb *toep) +{ + toep->tp_refcount = 1; + cv_init(&toep->tp_cv, "toep cv"); +} + +void +toepcb_hold(struct toepcb *toep) +{ + atomic_add_acq_int(&toep->tp_refcount, 1); +} + +void +toepcb_release(struct toepcb *toep) +{ + if (toep->tp_refcount == 1) { + free(toep, M_CXGB); + return; + } + atomic_add_acq_int(&toep->tp_refcount, -1); +} + + +/* + * Add a T3 offload device to the list of devices we are managing. + */ +static void +t3cdev_add(struct tom_data *t) +{ + mtx_lock(&cxgb_list_lock); + TAILQ_INSERT_TAIL(&cxgb_list, t, entry); + mtx_unlock(&cxgb_list_lock); +} + +static inline int +cdev2type(struct t3cdev *cdev) +{ + int type = 0; + + switch (cdev->type) { + case T3A: + type = TOE_ID_CHELSIO_T3; + break; + case T3B: + type = TOE_ID_CHELSIO_T3B; + break; + case T3C: + type = TOE_ID_CHELSIO_T3C; + break; + } + return (type); +} + +/* + * Allocate and initialize the TID tables. Returns 0 on success. + */ +static int +init_tid_tabs(struct tid_info *t, unsigned int ntids, + unsigned int natids, unsigned int nstids, + unsigned int atid_base, unsigned int stid_base) +{ + unsigned long size = ntids * sizeof(*t->tid_tab) + + natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab); + + t->tid_tab = cxgb_alloc_mem(size); + if (!t->tid_tab) + return (ENOMEM); + + t->stid_tab = (union listen_entry *)&t->tid_tab[ntids]; + t->atid_tab = (union active_open_entry *)&t->stid_tab[nstids]; + t->ntids = ntids; + t->nstids = nstids; + t->stid_base = stid_base; + t->sfree = NULL; + t->natids = natids; + t->atid_base = atid_base; + t->afree = NULL; + t->stids_in_use = t->atids_in_use = 0; + atomic_set_int(&t->tids_in_use, 0); + mtx_init(&t->stid_lock, "stid", NULL, MTX_DUPOK|MTX_DEF); + mtx_init(&t->atid_lock, "atid", NULL, MTX_DUPOK|MTX_DEF); + + /* + * Setup the free lists for stid_tab and atid_tab. + */ + if (nstids) { + while (--nstids) + t->stid_tab[nstids - 1].next = &t->stid_tab[nstids]; + t->sfree = t->stid_tab; + } + if (natids) { + while (--natids) + t->atid_tab[natids - 1].next = &t->atid_tab[natids]; + t->afree = t->atid_tab; + } + return 0; +} + +static void +free_tid_maps(struct tid_info *t) +{ + mtx_destroy(&t->stid_lock); + mtx_destroy(&t->atid_lock); + cxgb_free_mem(t->tid_tab); +} + +static inline void +add_adapter(adapter_t *adap) +{ + rw_wlock(&adapter_list_lock); + TAILQ_INSERT_TAIL(&adapter_list, adap, adapter_entry); + rw_wunlock(&adapter_list_lock); +} + +static inline void +remove_adapter(adapter_t *adap) +{ + rw_wlock(&adapter_list_lock); + TAILQ_REMOVE(&adapter_list, adap, adapter_entry); + rw_wunlock(&adapter_list_lock); +} + +/* + * Populate a TID_RELEASE WR. The mbuf must be already propely sized. 
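+ * "Properly sized" means the mbuf has room for a struct cpl_tid_release in
+ * its data area; the typical sequence, as used by the release task below, is:
+ *
+ *	m = m_get(M_WAIT, MT_DATA);
+ *	mk_tid_release(m, tid);
+ *	cxgb_ofld_send(tdev, m);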
+ */ +static inline void +mk_tid_release(struct mbuf *m, unsigned int tid) +{ + struct cpl_tid_release *req; + + m_set_priority(m, CPL_PRIORITY_SETUP); + req = mtod(m, struct cpl_tid_release *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); +} + +static void +t3_process_tid_release_list(void *data, int pending) +{ + struct mbuf *m; + struct t3cdev *tdev = data; + struct t3c_data *td = T3C_DATA (tdev); + + mtx_lock(&td->tid_release_lock); + while (td->tid_release_list) { + struct toe_tid_entry *p = td->tid_release_list; + + td->tid_release_list = (struct toe_tid_entry *)p->ctx; + mtx_unlock(&td->tid_release_lock); + m = m_get(M_WAIT, MT_DATA); + mk_tid_release(m, p - td->tid_maps.tid_tab); + cxgb_ofld_send(tdev, m); + p->ctx = NULL; + mtx_lock(&td->tid_release_lock); + } + mtx_unlock(&td->tid_release_lock); +} + +int +cxgb_offload_activate(struct adapter *adapter) +{ + struct t3cdev *dev = &adapter->tdev; + int natids, err; + struct t3c_data *t; + struct tid_range stid_range, tid_range; + struct mtutab mtutab; + unsigned int l2t_capacity; + + t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); + if (!t) + return (ENOMEM); + dev->adapter = adapter; + + err = (EOPNOTSUPP); + if (dev->ctl(dev, GET_TX_MAX_CHUNK, &t->tx_max_chunk) < 0 || + dev->ctl(dev, GET_MAX_OUTSTANDING_WR, &t->max_wrs) < 0 || + dev->ctl(dev, GET_L2T_CAPACITY, &l2t_capacity) < 0 || + dev->ctl(dev, GET_MTUS, &mtutab) < 0 || + dev->ctl(dev, GET_TID_RANGE, &tid_range) < 0 || + dev->ctl(dev, GET_STID_RANGE, &stid_range) < 0) { + device_printf(adapter->dev, "%s: dev->ctl check failed\n", __FUNCTION__); + goto out_free; + } + + err = (ENOMEM); + L2DATA(dev) = t3_init_l2t(l2t_capacity); + if (!L2DATA(dev)) { + device_printf(adapter->dev, "%s: t3_init_l2t failed\n", __FUNCTION__); + goto out_free; + } + natids = min(tid_range.num / 2, MAX_ATIDS); + err = init_tid_tabs(&t->tid_maps, tid_range.num, natids, + stid_range.num, ATID_BASE, stid_range.base); + if (err) { + device_printf(adapter->dev, "%s: init_tid_tabs failed\n", __FUNCTION__); + goto out_free_l2t; + } + + t->mtus = mtutab.mtus; + t->nmtus = mtutab.size; + + TASK_INIT(&t->tid_release_task, 0 /* XXX? 
*/, t3_process_tid_release_list, dev); + mtx_init(&t->tid_release_lock, "tid release", NULL, MTX_DUPOK|MTX_DEF); + t->dev = dev; + + T3C_DATA (dev) = t; + dev->recv = process_rx; + dev->arp_update = t3_l2t_update; + /* Register netevent handler once */ + if (TAILQ_EMPTY(&adapter_list)) { +#if defined(CONFIG_CHELSIO_T3_MODULE) + if (prepare_arp_with_t3core()) + log(LOG_ERR, "Unable to set offload capabilities\n"); +#endif + } + CTR1(KTR_CXGB, "adding adapter %p", adapter); + add_adapter(adapter); + device_printf(adapter->dev, "offload started\n"); + adapter->flags |= CXGB_OFLD_INIT; + return (0); + +out_free_l2t: + t3_free_l2t(L2DATA(dev)); + L2DATA(dev) = NULL; +out_free: + free(t, M_CXGB); + return (err); +} + +void +cxgb_offload_deactivate(struct adapter *adapter) +{ + struct t3cdev *tdev = &adapter->tdev; + struct t3c_data *t = T3C_DATA(tdev); + + printf("removing adapter %p\n", adapter); + remove_adapter(adapter); + if (TAILQ_EMPTY(&adapter_list)) { +#if defined(CONFIG_CHELSIO_T3_MODULE) + restore_arp_sans_t3core(); +#endif + } + free_tid_maps(&t->tid_maps); + T3C_DATA(tdev) = NULL; + t3_free_l2t(L2DATA(tdev)); + L2DATA(tdev) = NULL; + mtx_destroy(&t->tid_release_lock); + free(t, M_CXGB); +} + +/* + * Sends an sk_buff to a T3C driver after dealing with any active network taps. + */ +int +cxgb_ofld_send(struct t3cdev *dev, struct mbuf *m) +{ + int r; + + r = dev->send(dev, m); + return r; +} + +static struct ifnet * +get_iff_from_mac(adapter_t *adapter, const uint8_t *mac, unsigned int vlan) +{ + int i; + + for_each_port(adapter, i) { +#ifdef notyet + const struct vlan_group *grp; +#endif + const struct port_info *p = &adapter->port[i]; + struct ifnet *ifp = p->ifp; + + if (!memcmp(p->hw_addr, mac, ETHER_ADDR_LEN)) { +#ifdef notyet + + if (vlan && vlan != EVL_VLID_MASK) { + grp = p->vlan_grp; + dev = grp ? grp->vlan_devices[vlan] : NULL; + } else + while (dev->master) + dev = dev->master; +#endif + return (ifp); + } + } + return (NULL); +} + +static inline void +failover_fixup(adapter_t *adapter, int port) +{ + if (adapter->params.rev == 0) { + struct ifnet *ifp = adapter->port[port].ifp; + struct cmac *mac = &adapter->port[port].mac; + if (!(ifp->if_flags & IFF_UP)) { + /* Failover triggered by the interface ifdown */ + t3_write_reg(adapter, A_XGM_TX_CTRL + mac->offset, + F_TXEN); + t3_read_reg(adapter, A_XGM_TX_CTRL + mac->offset); + } else { + /* Failover triggered by the interface link down */ + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0); + t3_read_reg(adapter, A_XGM_RX_CTRL + mac->offset); + t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, + F_RXEN); + } + } +} + +static int +cxgb_ulp_iscsi_ctl(adapter_t *adapter, unsigned int req, void *data) +{ + int ret = 0; + struct ulp_iscsi_info *uiip = data; + + switch (req) { + case ULP_ISCSI_GET_PARAMS: + uiip->llimit = t3_read_reg(adapter, A_ULPRX_ISCSI_LLIMIT); + uiip->ulimit = t3_read_reg(adapter, A_ULPRX_ISCSI_ULIMIT); + uiip->tagmask = t3_read_reg(adapter, A_ULPRX_ISCSI_TAGMASK); + /* + * On tx, the iscsi pdu has to be <= tx page size and has to + * fit into the Tx PM FIFO. 
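+ * The min() below therefore clamps max_txsz to the smaller of the TP TX page
+ * size and the Tx PM FIFO capacity derived from the A_PM1_TX_CFG register.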
+ */ + uiip->max_txsz = min(adapter->params.tp.tx_pg_size, + t3_read_reg(adapter, A_PM1_TX_CFG) >> 17); + /* on rx, the iscsi pdu has to be < rx page size and the + whole pdu + cpl headers has to fit into one sge buffer */ + /* also check the max rx data length programmed in TP */ + uiip->max_rxsz = min(uiip->max_rxsz, + ((t3_read_reg(adapter, A_TP_PARA_REG2)) + >> S_MAXRXDATA) & M_MAXRXDATA); + break; + case ULP_ISCSI_SET_PARAMS: + t3_write_reg(adapter, A_ULPRX_ISCSI_TAGMASK, uiip->tagmask); + break; + default: + ret = (EOPNOTSUPP); + } + return ret; +} + +/* Response queue used for RDMA events. */ +#define ASYNC_NOTIF_RSPQ 0 + +static int +cxgb_rdma_ctl(adapter_t *adapter, unsigned int req, void *data) +{ + int ret = 0; + + switch (req) { + case RDMA_GET_PARAMS: { + struct rdma_info *req = data; + + req->udbell_physbase = rman_get_start(adapter->udbs_res); + req->udbell_len = rman_get_size(adapter->udbs_res); + req->tpt_base = t3_read_reg(adapter, A_ULPTX_TPT_LLIMIT); + req->tpt_top = t3_read_reg(adapter, A_ULPTX_TPT_ULIMIT); + req->pbl_base = t3_read_reg(adapter, A_ULPTX_PBL_LLIMIT); + req->pbl_top = t3_read_reg(adapter, A_ULPTX_PBL_ULIMIT); + req->rqt_base = t3_read_reg(adapter, A_ULPRX_RQ_LLIMIT); + req->rqt_top = t3_read_reg(adapter, A_ULPRX_RQ_ULIMIT); + req->kdb_addr = (void *)((unsigned long)rman_get_virtual(adapter->regs_res) + A_SG_KDOORBELL); break; + } + case RDMA_CQ_OP: { + struct rdma_cq_op *req = data; + + /* may be called in any context */ + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_cqcntxt_op(adapter, req->id, req->op, + req->credits); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + } + case RDMA_GET_MEM: { + struct ch_mem_range *t = data; + struct mc7 *mem; + + if ((t->addr & 7) || (t->len & 7)) + return (EINVAL); + if (t->mem_id == MEM_CM) + mem = &adapter->cm; + else if (t->mem_id == MEM_PMRX) + mem = &adapter->pmrx; + else if (t->mem_id == MEM_PMTX) + mem = &adapter->pmtx; + else + return (EINVAL); + + ret = t3_mc7_bd_read(mem, t->addr/8, t->len/8, (u64 *)t->buf); + if (ret) + return (ret); + break; + } + case RDMA_CQ_SETUP: { + struct rdma_cq_setup *req = data; + + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_init_cqcntxt(adapter, req->id, req->base_addr, + req->size, ASYNC_NOTIF_RSPQ, + req->ovfl_mode, req->credits, + req->credit_thres); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + } + case RDMA_CQ_DISABLE: + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_disable_cqcntxt(adapter, *(unsigned int *)data); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + case RDMA_CTRL_QP_SETUP: { + struct rdma_ctrlqp_setup *req = data; + + mtx_lock_spin(&adapter->sge.reg_lock); + ret = t3_sge_init_ecntxt(adapter, FW_RI_SGEEC_START, 0, + SGE_CNTXT_RDMA, ASYNC_NOTIF_RSPQ, + req->base_addr, req->size, + FW_RI_TID_START, 1, 0); + mtx_unlock_spin(&adapter->sge.reg_lock); + break; + } + default: + ret = EOPNOTSUPP; + } + return (ret); +} + +static int +cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data) +{ + struct adapter *adapter = tdev2adap(tdev); + struct tid_range *tid; + struct mtutab *mtup; + struct iff_mac *iffmacp; + struct ddp_params *ddpp; + struct adap_ports *ports; + struct ofld_page_info *rx_page_info; + struct tp_params *tp = &adapter->params.tp; + int port; + + switch (req) { + case GET_MAX_OUTSTANDING_WR: + *(unsigned int *)data = FW_WR_NUM; + break; + case GET_WR_LEN: + *(unsigned int *)data = WR_FLITS; + break; + case GET_TX_MAX_CHUNK: + *(unsigned int *)data = 1 << 20; /* 1MB */ + break; + case 
GET_TID_RANGE: + tid = data; + tid->num = t3_mc5_size(&adapter->mc5) - + adapter->params.mc5.nroutes - + adapter->params.mc5.nfilters - + adapter->params.mc5.nservers; + tid->base = 0; + break; + case GET_STID_RANGE: + tid = data; + tid->num = adapter->params.mc5.nservers; + tid->base = t3_mc5_size(&adapter->mc5) - tid->num - + adapter->params.mc5.nfilters - + adapter->params.mc5.nroutes; + break; + case GET_L2T_CAPACITY: + *(unsigned int *)data = 2048; + break; + case GET_MTUS: + mtup = data; + mtup->size = NMTUS; + mtup->mtus = adapter->params.mtus; + break; + case GET_IFF_FROM_MAC: + iffmacp = data; + iffmacp->dev = get_iff_from_mac(adapter, iffmacp->mac_addr, + iffmacp->vlan_tag & EVL_VLID_MASK); + break; + case GET_DDP_PARAMS: + ddpp = data; + ddpp->llimit = t3_read_reg(adapter, A_ULPRX_TDDP_LLIMIT); + ddpp->ulimit = t3_read_reg(adapter, A_ULPRX_TDDP_ULIMIT); + ddpp->tag_mask = t3_read_reg(adapter, A_ULPRX_TDDP_TAGMASK); + break; + case GET_PORTS: + ports = data; + ports->nports = adapter->params.nports; + for_each_port(adapter, port) + ports->lldevs[port] = adapter->port[port].ifp; + break; + case FAILOVER: + port = *(int *)data; + t3_port_failover(adapter, port); + failover_fixup(adapter, port); + break; + case FAILOVER_DONE: + port = *(int *)data; + t3_failover_done(adapter, port); + break; + case FAILOVER_CLEAR: + t3_failover_clear(adapter); + break; + case GET_RX_PAGE_INFO: + rx_page_info = data; + rx_page_info->page_size = tp->rx_pg_size; + rx_page_info->num = tp->rx_num_pgs; + break; + case ULP_ISCSI_GET_PARAMS: + case ULP_ISCSI_SET_PARAMS: + if (!offload_running(adapter)) + return (EAGAIN); + return cxgb_ulp_iscsi_ctl(adapter, req, data); + case RDMA_GET_PARAMS: + case RDMA_CQ_OP: + case RDMA_CQ_SETUP: + case RDMA_CQ_DISABLE: + case RDMA_CTRL_QP_SETUP: + case RDMA_GET_MEM: + if (!offload_running(adapter)) + return (EAGAIN); + return cxgb_rdma_ctl(adapter, req, data); + default: + return (EOPNOTSUPP); + } + return 0; +} + +/* + * Allocate a TOM data structure, + * initialize its cpl_handlers + * and register it as a T3C client + */ +static void +t3c_tom_add(struct t3cdev *cdev) +{ + int i; + unsigned int wr_len; + struct tom_data *t; + struct toedev *tdev; + struct adap_ports *port_info; + + t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); + if (t == NULL) + return; + + cdev->send = t3_offload_tx; + cdev->ctl = cxgb_offload_ctl; + + if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0) + goto out_free_tom; + + port_info = malloc(sizeof(*port_info), M_CXGB, M_NOWAIT|M_ZERO); + if (!port_info) + goto out_free_tom; + + if (cdev->ctl(cdev, GET_PORTS, port_info) < 0) + goto out_free_all; + + t3_init_wr_tab(wr_len); + t->cdev = cdev; + t->client = &t3c_tom_client; + + /* Register TCP offload device */ + tdev = &t->tdev; + tdev->tod_ttid = cdev2type(cdev); + tdev->tod_lldev = cdev->lldev; + + if (register_toedev(tdev, "toe%d")) { + printf("unable to register offload device"); + goto out_free_all; + } + TOM_DATA(tdev) = t; + + for (i = 0; i < port_info->nports; i++) { + struct ifnet *ifp = port_info->lldevs[i]; + TOEDEV(ifp) = tdev; + + CTR1(KTR_TOM, "enabling toe on %p", ifp); + ifp->if_capabilities |= IFCAP_TOE4; + ifp->if_capenable |= IFCAP_TOE4; + } + t->ports = port_info; + + /* Add device to the list of offload devices */ + t3cdev_add(t); + + /* Activate TCP offload device */ + cxgb_offload_activate(TOM_DATA(tdev)->cdev->adapter); + + activate_offload(tdev); + cxgb_register_listeners(); + return; + +out_free_all: + printf("out_free_all fail\n"); + free(port_info, M_CXGB); +out_free_tom: 
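+	/*
+	 * Reached either directly (before port_info was allocated) or by
+	 * falling through from out_free_all above, which already freed
+	 * port_info; only the tom_data allocation itself remains.
+	 */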
+ printf("out_free_tom fail\n"); + free(t, M_CXGB); + return; +} + + + +static int +do_act_open_rpl(struct t3cdev *dev, struct mbuf *m) +{ + struct cpl_act_open_rpl *rpl = cplhdr(m); + unsigned int atid = G_TID(ntohl(rpl->atid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid); + if (toe_tid->ctx && toe_tid->client && toe_tid->client->handlers && + toe_tid->client->handlers[CPL_ACT_OPEN_RPL]) { + return toe_tid->client->handlers[CPL_ACT_OPEN_RPL] (dev, m, + toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, CPL_ACT_OPEN_RPL); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_stid_rpl(struct t3cdev *dev, struct mbuf *m) +{ + union opcode_tid *p = cplhdr(m); + unsigned int stid = G_TID(ntohl(p->opcode_tid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[p->opcode]) { + return toe_tid->client->handlers[p->opcode] (dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_hwtid_rpl(struct t3cdev *dev, struct mbuf *m) +{ + union opcode_tid *p = cplhdr(m); + unsigned int hwtid; + struct toe_tid_entry *toe_tid; + + DPRINTF("do_hwtid_rpl opcode=0x%x\n", p->opcode); + hwtid = G_TID(ntohl(p->opcode_tid)); + + toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[p->opcode]) { + return toe_tid->client->handlers[p->opcode] + (dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, p->opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_cr(struct t3cdev *dev, struct mbuf *m) +{ + struct cpl_pass_accept_req *req = cplhdr(m); + unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_stid(&(T3C_DATA (dev))->tid_maps, stid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ]) { + return toe_tid->client->handlers[CPL_PASS_ACCEPT_REQ] + (dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, CPL_PASS_ACCEPT_REQ); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + +static int +do_abort_req_rss(struct t3cdev *dev, struct mbuf *m) +{ + union opcode_tid *p = cplhdr(m); + unsigned int hwtid = G_TID(ntohl(p->opcode_tid)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); + if (toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[p->opcode]) { + return toe_tid->client->handlers[p->opcode] + (dev, m, toe_tid->ctx); + } else { + struct cpl_abort_req_rss *req = cplhdr(m); + struct cpl_abort_rpl *rpl; + + struct mbuf *m = m_get(M_NOWAIT, MT_DATA); + if (!m) { + log(LOG_NOTICE, "do_abort_req_rss: couldn't get mbuf!\n"); + goto out; + } + + m_set_priority(m, CPL_PRIORITY_DATA); + rpl = cplhdr(m); + rpl->wr.wr_hi = + htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(GET_TID(req))); + OPCODE_TID(rpl) = + htonl(MK_OPCODE_TID(CPL_ABORT_RPL, GET_TID(req))); + rpl->cmd = req->status; + cxgb_ofld_send(dev, m); + out: + return (CPL_RET_BUF_DONE); + } +} + +static int +do_act_establish(struct t3cdev *dev, struct mbuf *m) +{ + struct 
cpl_act_establish *req; + unsigned int atid; + struct toe_tid_entry *toe_tid; + + req = cplhdr(m); + atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); + toe_tid = lookup_atid(&(T3C_DATA (dev))->tid_maps, atid); + if (toe_tid && toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[CPL_ACT_ESTABLISH]) { + + return toe_tid->client->handlers[CPL_ACT_ESTABLISH] + (dev, m, toe_tid->ctx); + } else { + + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, CPL_PASS_ACCEPT_REQ); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } +} + + +static int +do_term(struct t3cdev *dev, struct mbuf *m) +{ + unsigned int hwtid = ntohl(m_get_priority(m)) >> 8 & 0xfffff; + unsigned int opcode = G_OPCODE(ntohl(m->m_pkthdr.csum_data)); + struct toe_tid_entry *toe_tid; + + toe_tid = lookup_tid(&(T3C_DATA (dev))->tid_maps, hwtid); + if (toe_tid && toe_tid->ctx && toe_tid->client->handlers && + toe_tid->client->handlers[opcode]) { + return toe_tid->client->handlers[opcode](dev, m, toe_tid->ctx); + } else { + log(LOG_ERR, "%s: received clientless CPL command 0x%x\n", + dev->name, opcode); + return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG; + } + return (0); +} + +/* + * Process a received packet with an unknown/unexpected CPL opcode. + */ +static int +do_bad_cpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + log(LOG_ERR, "%s: received bad CPL command %u\n", cdev->name, + 0xFF & *mtod(m, unsigned int *)); + return (CPL_RET_BUF_DONE | CPL_RET_BAD_MSG); +} + +/* + * Add a new handler to the CPL dispatch table. A NULL handler may be supplied + * to unregister an existing handler. + */ +void +t3tom_register_cpl_handler(unsigned int opcode, cxgb_cpl_handler_func h) +{ + if (opcode < UCHAR_MAX) + tom_cpl_handlers[opcode] = h ? h : do_bad_cpl; + else + log(LOG_ERR, "Chelsio T3 TOM: handler registration for " + "opcode %u failed\n", opcode); +} + +/* + * Make a preliminary determination if a connection can be offloaded. It's OK + * to fail the offload later if we say we can offload here. For now this + * always accepts the offload request unless there are IP options. + */ +static int +can_offload(struct toedev *dev, struct socket *so) +{ + struct tom_data *tomd = TOM_DATA(dev); + struct t3cdev *cdev = T3CDEV(dev->tod_lldev); + struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; + + return so_sotoinpcb(so)->inp_depend4.inp4_options == NULL && + tomd->conf.activated && + (tomd->conf.max_conn < 0 || + atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn); +} + +static int +tom_ctl(struct toedev *dev, unsigned int req, void *data) +{ + struct tom_data *t = TOM_DATA(dev); + struct t3cdev *cdev = t->cdev; + + if (cdev->ctl) + return cdev->ctl(cdev, req, data); + + return (EOPNOTSUPP); +} + +/* + * Free an active-open TID. + */ +void * +cxgb_free_atid(struct t3cdev *tdev, int atid) +{ + struct tid_info *t = &(T3C_DATA(tdev))->tid_maps; + union active_open_entry *p = atid2entry(t, atid); + void *ctx = p->toe_tid.ctx; + + mtx_lock(&t->atid_lock); + p->next = t->afree; + t->afree = p; + t->atids_in_use--; + mtx_unlock(&t->atid_lock); + + return ctx; +} + +/* + * Free a server TID and return it to the free pool. + */ +void +cxgb_free_stid(struct t3cdev *tdev, int stid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + union listen_entry *p = stid2entry(t, stid); + + mtx_lock(&t->stid_lock); + p->next = t->sfree; + t->sfree = p; + t->stids_in_use--; + mtx_unlock(&t->stid_lock); +} + +/* + * Free a server TID and return it to the free pool. 
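+ * (Note: despite the summary above, this routine frees nothing; it simply
+ * returns the listen context pointer stored for the given server TID.)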
+ */ +void * +cxgb_get_lctx(struct t3cdev *tdev, int stid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + union listen_entry *p = stid2entry(t, stid); + + return (p->toe_tid.ctx); +} + +void +cxgb_insert_tid(struct t3cdev *tdev, struct cxgb_client *client, + void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + t->tid_tab[tid].client = client; + t->tid_tab[tid].ctx = ctx; + atomic_add_int(&t->tids_in_use, 1); +} + +/* use ctx as a next pointer in the tid release list */ +void +cxgb_queue_tid_release(struct t3cdev *tdev, unsigned int tid) +{ + struct t3c_data *td = T3C_DATA (tdev); + struct toe_tid_entry *p = &td->tid_maps.tid_tab[tid]; + + CTR0(KTR_TOM, "queuing tid release\n"); + + mtx_lock(&td->tid_release_lock); + p->ctx = td->tid_release_list; + td->tid_release_list = p; + + if (!p->ctx) + taskqueue_enqueue(tdev->adapter->tq, &td->tid_release_task); + + mtx_unlock(&td->tid_release_lock); +} + +/* + * Remove a tid from the TID table. A client may defer processing its last + * CPL message if it is locked at the time it arrives, and while the message + * sits in the client's backlog the TID may be reused for another connection. + * To handle this we atomically switch the TID association if it still points + * to the original client context. + */ +void +cxgb_remove_tid(struct t3cdev *tdev, void *ctx, unsigned int tid) +{ + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + if (tid >= t->ntids) + panic("tid=%d >= t->ntids=%d", tid, t->ntids); + + if (tdev->type == T3A) + atomic_cmpset_ptr((uintptr_t *)&t->tid_tab[tid].ctx, (long)NULL, (long)ctx); + else { + struct mbuf *m; + + m = m_get(M_NOWAIT, MT_DATA); + if (__predict_true(m != NULL)) { + mk_tid_release(m, tid); + CTR1(KTR_CXGB, "releasing tid=%u", tid); + + cxgb_ofld_send(tdev, m); + t->tid_tab[tid].ctx = NULL; + } else + cxgb_queue_tid_release(tdev, tid); + } + atomic_add_int(&t->tids_in_use, -1); +} + +int +cxgb_alloc_atid(struct t3cdev *tdev, struct cxgb_client *client, + void *ctx) +{ + int atid = -1; + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + mtx_lock(&t->atid_lock); + if (t->afree) { + union active_open_entry *p = t->afree; + + atid = (p - t->atid_tab) + t->atid_base; + t->afree = p->next; + p->toe_tid.ctx = ctx; + p->toe_tid.client = client; + t->atids_in_use++; + } + mtx_unlock(&t->atid_lock); + return atid; +} + +int +cxgb_alloc_stid(struct t3cdev *tdev, struct cxgb_client *client, + void *ctx) +{ + int stid = -1; + struct tid_info *t = &(T3C_DATA (tdev))->tid_maps; + + mtx_lock(&t->stid_lock); + if (t->sfree) { + union listen_entry *p = t->sfree; + + stid = (p - t->stid_tab) + t->stid_base; + t->sfree = p->next; + p->toe_tid.ctx = ctx; + p->toe_tid.client = client; + t->stids_in_use++; + } + mtx_unlock(&t->stid_lock); + return stid; +} + + +static int +is_offloading(struct ifnet *ifp) +{ + struct adapter *adapter; + int port; + + rw_rlock(&adapter_list_lock); + TAILQ_FOREACH(adapter, &adapter_list, adapter_entry) { + for_each_port(adapter, port) { + if (ifp == adapter->port[port].ifp) { + rw_runlock(&adapter_list_lock); + return 1; + } + } + } + rw_runlock(&adapter_list_lock); + return 0; +} + + +static void +cxgb_arp_update_event(void *unused, struct rtentry *rt0, + uint8_t *enaddr, struct sockaddr *sa) +{ + + if (!is_offloading(rt0->rt_ifp)) + return; + + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + cxgb_neigh_update(rt0, enaddr, sa); + RT_LOCK(rt0); + RT_REMREF(rt0); +} + +static void +cxgb_redirect_event(void *unused, int event, struct rtentry *rt0, + struct rtentry 
*rt1, struct sockaddr *sa) +{ + /* + * ignore events on non-offloaded interfaces + */ + if (!is_offloading(rt0->rt_ifp)) + return; + + /* + * Cannot redirect to non-offload device. + */ + if (!is_offloading(rt1->rt_ifp)) { + log(LOG_WARNING, "%s: Redirect to non-offload" + "device ignored.\n", __FUNCTION__); + return; + } + + /* + * avoid LORs by dropping the route lock but keeping a reference + * + */ + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + RT_ADDREF(rt1); + RT_UNLOCK(rt1); + + cxgb_redirect(rt0, rt1, sa); + cxgb_neigh_update(rt1, NULL, sa); + + RT_LOCK(rt0); + RT_REMREF(rt0); + RT_LOCK(rt1); + RT_REMREF(rt1); +} + +void +cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa) +{ + + if (rt->rt_ifp && is_offloading(rt->rt_ifp) && (rt->rt_ifp->if_flags & IFCAP_TOE)) { + struct t3cdev *tdev = T3CDEV(rt->rt_ifp); + + PANIC_IF(!tdev); + t3_l2t_update(tdev, rt, enaddr, sa); + } +} + +static void +set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e) +{ + struct mbuf *m; + struct cpl_set_tcb_field *req; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (!m) { + log(LOG_ERR, "%s: cannot allocate mbuf!\n", __FUNCTION__); + return; + } + + m_set_priority(m, CPL_PRIORITY_CONTROL); + req = mtod(m, struct cpl_set_tcb_field *); + m->m_pkthdr.len = m->m_len = sizeof(*req); + + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_L2T_IX); + req->mask = htobe64(V_TCB_L2T_IX(M_TCB_L2T_IX)); + req->val = htobe64(V_TCB_L2T_IX(e->idx)); + tdev->send(tdev, m); +} + +void +cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa) +{ + struct ifnet *olddev, *newdev; + struct tid_info *ti; + struct t3cdev *tdev; + u32 tid; + int update_tcb; + struct l2t_entry *e; + struct toe_tid_entry *te; + + olddev = old->rt_ifp; + newdev = new->rt_ifp; + if (!is_offloading(olddev)) + return; + if (!is_offloading(newdev)) { + log(LOG_WARNING, "%s: Redirect to non-offload" + "device ignored.\n", __FUNCTION__); + return; + } + tdev = T3CDEV(olddev); + PANIC_IF(!tdev); + if (tdev != T3CDEV(newdev)) { + log(LOG_WARNING, "%s: Redirect to different " + "offload device ignored.\n", __FUNCTION__); + return; + } + + /* Add new L2T entry */ + e = t3_l2t_get(tdev, new, new->rt_ifp, sa); + if (!e) { + log(LOG_ERR, "%s: couldn't allocate new l2t entry!\n", + __FUNCTION__); + return; + } + + /* Walk tid table and notify clients of dst change. */ + ti = &(T3C_DATA (tdev))->tid_maps; + for (tid=0; tid < ti->ntids; tid++) { + te = lookup_tid(ti, tid); + PANIC_IF(!te); + if (te->ctx && te->client && te->client->redirect) { + update_tcb = te->client->redirect(te->ctx, old, new, + e); + if (update_tcb) { + l2t_hold(L2DATA(tdev), e); + set_l2t_ix(tdev, tid, e); + } + } + } + l2t_release(L2DATA(tdev), e); +} + +/* + * Initialize the CPL dispatch table. 
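+ * Every opcode initially points at do_bad_cpl(); specific handlers are then
+ * installed with t3tom_register_cpl_handler(), e.g. the listen handlers set
+ * up by t3_init_listen_cpl_handlers():
+ *
+ *	t3tom_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
+ *	t3tom_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);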
+ */ +static void +init_cpl_handlers(void) +{ + int i; + + for (i = 0; i < 256; ++i) + tom_cpl_handlers[i] = do_bad_cpl; + + t3_init_listen_cpl_handlers(); +} + +static int +t3_toe_attach(struct toedev *dev, const struct offload_id *entry) +{ + struct tom_data *t = TOM_DATA(dev); + struct t3cdev *cdev = t->cdev; + struct ddp_params ddp; + struct ofld_page_info rx_page_info; + int err; + + t3_init_tunables(t); + mtx_init(&t->listen_lock, "tom data listeners", NULL, MTX_DEF); + CTR2(KTR_TOM, "t3_toe_attach dev=%p entry=%p", dev, entry); + /* Adjust TOE activation for this module */ + t->conf.activated = activated; + + dev->tod_can_offload = can_offload; + dev->tod_connect = t3_connect; + dev->tod_ctl = tom_ctl; +#if 0 + dev->tod_failover = t3_failover; +#endif + err = cdev->ctl(cdev, GET_DDP_PARAMS, &ddp); + if (err) + return err; + + err = cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info); + if (err) + return err; + + t->ddp_llimit = ddp.llimit; + t->ddp_ulimit = ddp.ulimit; + t->pdev = ddp.pdev; + t->rx_page_size = rx_page_info.page_size; + /* OK if this fails, we just can't do DDP */ + t->nppods = (ddp.ulimit + 1 - ddp.llimit) / PPOD_SIZE; + t->ppod_map = malloc(t->nppods, M_DEVBUF, M_NOWAIT|M_ZERO); + + mtx_init(&t->ppod_map_lock, "ppod map", NULL, MTX_DEF); + + + t3_sysctl_register(cdev->adapter, &t->conf); + return (0); +} + +static void +cxgb_toe_listen_start(void *unused, struct tcpcb *tp) +{ + struct socket *so = inp_inpcbtosocket(tp->t_inpcb); + struct tom_data *p; + + mtx_lock(&cxgb_list_lock); + TAILQ_FOREACH(p, &cxgb_list, entry) { + t3_listen_start(&p->tdev, so, p->cdev); + } + mtx_unlock(&cxgb_list_lock); +} + +static void +cxgb_toe_listen_stop(void *unused, struct tcpcb *tp) +{ + struct socket *so = inp_inpcbtosocket(tp->t_inpcb); + struct tom_data *p; + + mtx_lock(&cxgb_list_lock); + TAILQ_FOREACH(p, &cxgb_list, entry) { + if (tp->t_state == TCPS_LISTEN) + t3_listen_stop(&p->tdev, so, p->cdev); + } + mtx_unlock(&cxgb_list_lock); +} + +static void +cxgb_toe_listen_start_handler(struct inpcb *inp, void *arg) +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp->t_state == TCPS_LISTEN) + cxgb_toe_listen_start(NULL, tp); +} + +static void +cxgb_register_listeners(void) +{ + + inp_apply_all(cxgb_toe_listen_start_handler, NULL); +} + +static int +t3_tom_init(void) +{ + init_cpl_handlers(); + if (t3_init_cpl_io() < 0) { + log(LOG_ERR, + "Unable to initialize cpl io ops\n"); + return -1; + } + t3_init_socket_ops(); + + /* Register with the TOE device layer. 
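+	 * register_tom() hands toecore the t3_tom_info descriptor (its attach
+	 * callback and t3_toe_id_tab ID table) so that t3_toe_attach() runs
+	 * for matching T3 offload devices.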
*/ + + if (register_tom(&t3_tom_info) != 0) { + log(LOG_ERR, + "Unable to register Chelsio T3 TCP offload module.\n"); + return -1; + } + + rw_init(&adapter_list_lock, "ofld adap list"); + TAILQ_INIT(&adapter_list); + EVENTHANDLER_REGISTER(route_arp_update_event, cxgb_arp_update_event, + NULL, EVENTHANDLER_PRI_ANY); + EVENTHANDLER_REGISTER(route_redirect_event, cxgb_redirect_event, + NULL, EVENTHANDLER_PRI_ANY); + + mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF); + listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start, + cxgb_toe_listen_start, NULL, EVENTHANDLER_PRI_ANY); + listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_stop, + cxgb_toe_listen_stop, NULL, EVENTHANDLER_PRI_ANY); + TAILQ_INIT(&cxgb_list); + + + + t3_register_cpl_handler(CPL_PASS_OPEN_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_stid_rpl); + t3_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_cr); + t3_register_cpl_handler(CPL_PASS_ESTABLISH, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL_RSS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_URG_NOTIFY, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DATA, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DATA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_TX_DMA_ACK, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); + t3_register_cpl_handler(CPL_PEER_CLOSE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_CLOSE_CON_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req_rss); + t3_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); + t3_register_cpl_handler(CPL_RDMA_TERMINATE, do_term); + t3_register_cpl_handler(CPL_RDMA_EC_STATUS, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DATA_DDP, do_hwtid_rpl); + t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl); + t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl); + t3_register_cpl_handler(CPL_GET_TCB_RPL, do_hwtid_rpl); + t3_register_cpl_handler(CPL_SET_TCB_RPL, do_hwtid_rpl); + + /* Register to offloading devices */ + cxgb_register_client(&t3c_tom_client); + + return (0); +} + +static int +t3_tom_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + t3_tom_init(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("uhm, ... unloading isn't really supported for toe\n"); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data= { + "t3_tom", + t3_tom_load, + 0 +}; +MODULE_VERSION(t3_tom, 1); +MODULE_DEPEND(t3_tom, toecore, 1, 1, 1); +MODULE_DEPEND(t3_tom, if_cxgb, 1, 1, 1); +DECLARE_MODULE(t3_tom, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.h b/sys/dev/cxgb/ulp/tom/cxgb_tom.h new file mode 100644 index 0000000000000..bcda2c3c57aaa --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.h @@ -0,0 +1,159 @@ + +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. 
Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+$FreeBSD$
+
+***************************************************************************/
+#ifndef CXGB_TOM_H_
+#define CXGB_TOM_H_
+#include <sys/protosw.h>
+
+#define LISTEN_INFO_HASH_SIZE 32
+
+struct listen_info {
+ struct listen_info *next; /* Link to next entry */
+ struct socket *so; /* The listening socket */
+ unsigned int stid; /* The server TID */
+};
+
+
+/*
+ * TOM tunable parameters. They can be manipulated through sysctl(2) or /proc.
+ */
+struct tom_tunables {
+ int max_host_sndbuf; // max host RAM consumed by a sndbuf
+ int tx_hold_thres; // push/pull threshold for non-full TX sk_buffs
+ int max_wrs; // max # of outstanding WRs per connection
+ int rx_credit_thres; // min # of RX credits needed for RX_DATA_ACK
+ int cong_alg; // Congestion control algorithm
+ int mss; // max TX_DATA WR payload size
+ int delack; // delayed ACK control
+ int max_conn; // maximum number of offloaded connections
+ int soft_backlog_limit; // whether the listen backlog limit is soft
+ int ddp; // whether to put new connections in DDP mode
+ int ddp_thres; // min recvmsg size before activating DDP
+ int ddp_copy_limit; // capacity of kernel DDP buffer
+ int ddp_push_wait; // whether blocking DDP waits for PSH flag
+ int ddp_rcvcoalesce; // whether receive coalescing is enabled
+ int zcopy_sosend_enabled; // < is never zcopied
+ int zcopy_sosend_partial_thres; // < is never zcopied
+ int zcopy_sosend_partial_copy; // bytes copied in partial zcopy
+ int zcopy_sosend_thres;// >= are mostly zcopied
+ int zcopy_sosend_copy; // bytes copied in zcopied
+ int zcopy_sosend_ret_pending_dma;// pot. return while pending DMA
+ int activated; // TOE engine activation state
+};
+
+struct tom_data {
+ TAILQ_ENTRY(tom_data) entry;
+
+ struct t3cdev *cdev;
+ struct pci_dev *pdev;
+ struct toedev tdev;
+
+ struct cxgb_client *client;
+ struct tom_tunables conf;
+ struct tom_sysctl_table *sysctl;
+
+ /*
+ * The next three locks listen_lock, deferq.lock, and tid_release_lock
+ * are used rarely so we let them potentially share a cacheline.
+ */ + + struct listen_info *listen_hash_tab[LISTEN_INFO_HASH_SIZE]; + struct mtx listen_lock; + + struct mbuf_head deferq; + struct task deferq_task; + + struct socket **tid_release_list; + struct mtx tid_release_lock; + struct task tid_release_task; + + volatile int tx_dma_pending; + + unsigned int ddp_llimit; + unsigned int ddp_ulimit; + + unsigned int rx_page_size; + + u8 *ppod_map; + unsigned int nppods; + struct mtx ppod_map_lock; + + struct adap_ports *ports; + struct taskqueue *tq; +}; + + +struct listen_ctx { + struct socket *lso; + struct tom_data *tom_data; + int ulp_mode; + LIST_HEAD(, toepcb) synq_head; + +}; + +#define TOM_DATA(dev) (*(struct tom_data **)&(dev)->tod_l4opt) +#define T3C_DEV(sk) ((TOM_DATA(TOE_DEV(sk)))->cdev) +#define TOEP_T3C_DEV(toep) (TOM_DATA(toep->tp_toedev)->cdev) +#define TOM_TUNABLE(dev, param) (TOM_DATA(dev)->conf.param) + +#define TP_DATASENT (1 << 0) +#define TP_TX_WAIT_IDLE (1 << 1) +#define TP_FIN_SENT (1 << 2) +#define TP_ABORT_RPL_PENDING (1 << 3) +#define TP_ABORT_SHUTDOWN (1 << 4) +#define TP_ABORT_RPL_RCVD (1 << 5) +#define TP_ABORT_REQ_RCVD (1 << 6) +#define TP_CLOSE_CON_REQUESTED (1 << 7) +#define TP_SYN_RCVD (1 << 8) +#define TP_ESTABLISHED (1 << 9) + +void t3_init_tunables(struct tom_data *t); + +void t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p); + +static __inline struct mbuf * +m_gethdr_nofail(int len) +{ + struct mbuf *m; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + panic("implement lowmem cache\n"); + } + + KASSERT(len < MHLEN, ("requested header size too large for mbuf")); + m->m_pkthdr.len = m->m_len = len; + return (m); +} + + +#endif diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c new file mode 100644 index 0000000000000..1490bfbdc29bd --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom_sysctl.c @@ -0,0 +1,119 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/mutex.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> + +#include <dev/cxgb/cxgb_osdep.h> +#include <dev/cxgb/sys/mbufq.h> + +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <net/route.h> + +#include <dev/cxgb/t3cdev.h> +#include <dev/cxgb/common/cxgb_firmware_exports.h> +#include <dev/cxgb/common/cxgb_tcb.h> +#include <dev/cxgb/common/cxgb_ctl_defs.h> +#include <dev/cxgb/common/cxgb_t3_cpl.h> +#include <dev/cxgb/cxgb_offload.h> +#include <dev/cxgb/cxgb_include.h> +#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> +#include <dev/cxgb/ulp/tom/cxgb_tom.h> +#include <dev/cxgb/ulp/tom/cxgb_defs.h> +#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> + +static struct tom_tunables default_tunable_vals = { + .max_host_sndbuf = 32 * 1024, + .tx_hold_thres = 0, + .max_wrs = 15, + .rx_credit_thres = 15 * 1024, + .cong_alg = -1, + .mss = 16384, + .delack = 1, + .max_conn = -1, + .soft_backlog_limit = 0, + .ddp = 1, + .ddp_thres = 14 * 4096, + .ddp_copy_limit = 13 * 4096, + .ddp_push_wait = 1, + .ddp_rcvcoalesce = 0, + .zcopy_sosend_enabled = 0, + .zcopy_sosend_partial_thres = 40960, + .zcopy_sosend_partial_copy = 4096 * 3, + .zcopy_sosend_thres = 128 * 1024, + .zcopy_sosend_copy = 4096 * 2, + .zcopy_sosend_ret_pending_dma = 1, + .activated = 1, +}; + +void +t3_init_tunables(struct tom_data *t) +{ + t->conf = default_tunable_vals; + + /* Now apply device specific fixups. */ + t->conf.mss = T3C_DATA(t->cdev)->tx_max_chunk; + t->conf.max_wrs = T3C_DATA(t->cdev)->max_wrs; +} + +void +t3_sysctl_register(struct adapter *sc, const struct tom_tunables *p) +{ + struct sysctl_ctx_list *ctx; + struct sysctl_oid_list *children; + + ctx = device_get_sysctl_ctx(sc->dev); + children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); + +} + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.c b/sys/dev/cxgb/ulp/tom/cxgb_vm.c new file mode 100644 index 0000000000000..7036005e93e04 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.c @@ -0,0 +1,180 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +***************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/limits.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/condvar.h> +#include <sys/mutex.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <dev/cxgb/ulp/tom/cxgb_vm.h> + +#define TRACE_ENTER printf("%s:%s entered", __FUNCTION__, __FILE__) +#define TRACE_EXIT printf("%s:%s:%d exited", __FUNCTION__, __FILE__, __LINE__) + +/* + * This routine takes a user address range and does the following: + * - validate that the user has access to those pages (flags indicates read or write) - if not fail + * - validate that count is enough to hold range number of pages - if not fail + * - fault in any non-resident pages + * - if the user is doing a read force a write fault for any COWed pages + * - if the user is doing a read mark all pages as dirty + * - hold all pages + * - return number of pages in count + */ +int +vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags) +{ + + vm_offset_t end, va; + vm_paddr_t pa; + int faults, rv; + + struct thread *td; + vm_map_t map; + pmap_t pmap; + vm_page_t m, *pages; + vm_prot_t prot; + + + /* + * Check that virtual address range is legal + * This check is somewhat bogus as on some architectures kernel + * and user do not share VA - however, it appears that all FreeBSD + * architectures define it + */ + end = addr + (count * PAGE_SIZE); + if (end > VM_MAXUSER_ADDRESS) { + printf("bad address passed\n"); + return (EFAULT); + } + + td = curthread; + map = &td->td_proc->p_vmspace->vm_map; + pmap = &td->td_proc->p_vmspace->vm_pmap; + pages = mp; + + prot = VM_PROT_READ; + prot |= (flags & VM_HOLD_WRITEABLE) ? 
VM_PROT_WRITE : 0;
+ bzero(pages, sizeof(vm_page_t *) * count);
+retry:
+
+ /*
+ * First optimistically assume that all pages are resident (and R/W if for write)
+ * if so just mark pages as held (and dirty if for write) and return
+ */
+ vm_page_lock_queues();
+ for (pages = mp, faults = 0, va = addr; va < end; va += PAGE_SIZE, pages++) {
+ /*
+ * Assure that we only hold the page once
+ */
+ if (*pages == NULL) {
+ /*
+ * page queue mutex is recursable so this is OK
+ * it would be really nice if we had an unlocked version of this so
+ * we were only acquiring the pmap lock 1 time as opposed to potentially
+ * many dozens of times
+ */
+ m = pmap_extract_and_hold(pmap, va, prot);
+ if (m == NULL) {
+ faults++;
+ continue;
+ }
+
+ *pages = m;
+ if (flags & VM_HOLD_WRITEABLE)
+ vm_page_dirty(m);
+ }
+ }
+ vm_page_unlock_queues();
+
+ if (faults == 0) {
+ return (0);
+ }
+
+ /*
+ * Pages either have insufficient permissions or are not present;
+ * trigger a fault where necessary
+ *
+ */
+ for (va = addr; va < end; va += PAGE_SIZE) {
+ m = NULL;
+ pa = pmap_extract(pmap, va);
+ rv = 0;
+ if (pa)
+ m = PHYS_TO_VM_PAGE(pa);
+ if (flags & VM_HOLD_WRITEABLE) {
+ if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
+ rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
+ } else if (m == NULL)
+ rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
+ if (rv) {
+ printf("vm_fault bad return rv=%d va=0x%zx\n", rv, va);
+
+ goto error;
+ }
+ }
+
+ goto retry;
+
+error:
+ vm_page_lock_queues();
+ for (pages = mp, va = addr; va < end; va += PAGE_SIZE, pages++)
+ if (*pages)
+ vm_page_unhold(*pages);
+ vm_page_unlock_queues();
+ return (EFAULT);
+}
+
+void
+vm_fault_unhold_pages(vm_page_t *mp, int count)
+{
+
+ KASSERT(count >= 0, ("negative count %d", count));
+ vm_page_lock_queues();
+ while (count--) {
+ vm_page_unhold(*mp);
+ mp++;
+ }
+ vm_page_unlock_queues();
+}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_vm.h b/sys/dev/cxgb/ulp/tom/cxgb_vm.h
new file mode 100644
index 0000000000000..29418b616fd4f
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_vm.h
@@ -0,0 +1,40 @@
+/**************************************************************************
+
+Copyright (c) 2007, Chelsio Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the Chelsio Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+ + +$FreeBSD$ + +***************************************************************************/ +#ifndef CXGB_VM_H_ +#define CXGB_VM_H_ + +#define VM_HOLD_WRITEABLE 0x1 + +int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags); +void vm_fault_unhold_pages(vm_page_t *mp, int count); + +#endif diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile index 6e35a6ee0d86a..85c6f4875d5a1 100644 --- a/sys/modules/cxgb/Makefile +++ b/sys/modules/cxgb/Makefile @@ -1,5 +1,16 @@ # $FreeBSD$ SUBDIR= cxgb +#SUBDIR+= toecore +#SUBDIR+= tom +#SUBDIR+= ${_iw_cxgb} SUBDIR+= cxgb_t3fw +.if ${MACHINE_ARCH} == "i386" +_iw_cxgb = iw_cxgb +.endif + +.if ${MACHINE_ARCH} == "amd64" +_iw_cxgb = iw_cxgb +.endif + .include <bsd.subdir.mk> diff --git a/sys/modules/cxgb/cxgb/Makefile b/sys/modules/cxgb/cxgb/Makefile index 039032da8f6a7..64044e86e4d00 100644 --- a/sys/modules/cxgb/cxgb/Makefile +++ b/sys/modules/cxgb/cxgb/Makefile @@ -4,15 +4,21 @@ CXGB = ${.CURDIR}/../../../dev/cxgb .PATH: ${CXGB} ${CXGB}/common ${CXGB}/sys KMOD= if_cxgb -SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c +SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c -SRCS+= cxgb_sge.c cxgb_lro.c cxgb_offload.c -SRCS+= device_if.h bus_if.h pci_if.h opt_zero.h -SRCS+= uipc_mvec.c cxgb_support.c cxgb_multiq.c - -CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -DDEFAULT_JUMBO -I${CXGB} +SRCS+= cxgb_sge.c cxgb_lro.c cxgb_offload.c cxgb_tn1010.c +SRCS+= device_if.h bus_if.h pci_if.h opt_zero.h opt_sched.h +SRCS+= uipc_mvec.c cxgb_support.c cxgb_multiq.c +CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -DDEFAULT_JUMBO -I${CXGB} -DSMP +CFLAGS+= -DDISABLE_MBUF_IOVEC +#CFLAGS+= -DIFNET_MULTIQUEUE +#CFLAGS+= -DDISABLE_MBUF_IOVEC #CFLAGS+= -DDEBUG -DDEBUG_PRINT +#CFLAGS+= -DINVARIANT_SUPPORT -DINVARIANTS +#CFLAGS+= -DWITNESS +#CFLAGS += -DLOCK_PROFILING +#CFLAGS+= -DWITNESS #CFLAGS+= -DINVARIANT_SUPPORT -DINVARIANTS -DWITNESS -.include <bsd.kmod.mk>
\ No newline at end of file +.include <bsd.kmod.mk> diff --git a/sys/modules/cxgb/cxgb_t3fw/Makefile b/sys/modules/cxgb/cxgb_t3fw/Makefile index 787c9d41fbcc2..c35d73a41f025 100644 --- a/sys/modules/cxgb/cxgb_t3fw/Makefile +++ b/sys/modules/cxgb/cxgb_t3fw/Makefile @@ -3,6 +3,7 @@ CXGB = ${.CURDIR}/../../../dev/cxgb .PATH: ${CXGB} +KMOD= cxgb_t3fw SRCS+= cxgb_t3fw.c .include <bsd.kmod.mk> diff --git a/sys/modules/cxgb/iw_cxgb/Makefile b/sys/modules/cxgb/iw_cxgb/Makefile new file mode 100644 index 0000000000000..e1123bba3faff --- /dev/null +++ b/sys/modules/cxgb/iw_cxgb/Makefile @@ -0,0 +1,14 @@ +# $FreeBSD$ + +CXGB = ${.CURDIR}/../../../dev/cxgb +.PATH: ${IW_CXGB} ${CXGB}/common ${CXGB}/ulp/iw_cxgb + +KMOD= iw_cxgb +SRCS= iw_cxgb.c iw_cxgb_cm.c iw_cxgb_hal.c +SRCS+= iw_cxgb_provider.c iw_cxgb_qp.c iw_cxgb_resource.c +SRCS+= iw_cxgb_ev.c iw_cxgb_mem.c iw_cxgb_dbg.c iw_cxgb_cq.c +SRCS+= bus_if.h device_if.h opt_sched.h pci_if.h pcib_if.h opt_ktr.h +CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -I${CXGB} -DSMP +#CFLAGS+= -DDEBUG + +.include <bsd.kmod.mk> diff --git a/sys/modules/cxgb/toecore/Makefile b/sys/modules/cxgb/toecore/Makefile new file mode 100644 index 0000000000000..1c05d799a5f88 --- /dev/null +++ b/sys/modules/cxgb/toecore/Makefile @@ -0,0 +1,8 @@ +# $FreeBSD$ +TOECORE = ${.CURDIR}/../../../dev/cxgb/ulp/toecore +.PATH: ${TOECORE} + +KMOD= toecore +SRCS= toedev.c +SRCS+= device_if.h bus_if.h pci_if.h opt_sched.h +.include <bsd.kmod.mk>
\ No newline at end of file
diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile
new file mode 100644
index 0000000000000..2417edf1fc40e
--- /dev/null
+++ b/sys/modules/cxgb/tom/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+
+TOM = ${.CURDIR}/../../../dev/cxgb/ulp/tom
+.PATH: ${TOM}
+
+KMOD= tom
+SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c
+SRCS+= cxgb_ddp.c cxgb_vm.c cxgb_l2t.c cxgb_tcp_offload.c
+SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h
+SRCS+= opt_tcpdebug.h opt_ddb.h opt_sched.h opt_global.h opt_ktr.h
+SRCS+= device_if.h bus_if.h pci_if.h
+
+#CFLAGS+= -DDEBUG_PRINT -DDEBUG
+.include <bsd.kmod.mk>
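
The new cxgb_vm.c/cxgb_vm.h helpers are not called anywhere else in this changeset, so the intended calling pattern is worth sketching: a consumer (for example the DDP path) wires down the pages backing a user buffer before handing them to the hardware and drops the holds once the I/O completes. The fragment below is a minimal, hypothetical sketch of such a caller; the wrapper names hold_user_buffer()/release_user_buffer(), the page-count arithmetic, and the M_DEVBUF allocation of the page array are illustrative assumptions, not code from this commit.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <dev/cxgb/ulp/tom/cxgb_vm.h>

/*
 * Hypothetical caller: fault in (if needed) and hold every page backing a
 * user buffer so a device can DMA into it, then release the holds when the
 * I/O is done.
 */
static int
hold_user_buffer(vm_offset_t uaddr, size_t len, int writeable,
    vm_page_t **pagesp, int *npagesp)
{
	vm_offset_t start = trunc_page(uaddr);
	int npages = (int)((round_page(uaddr + len) - start) >> PAGE_SHIFT);
	vm_page_t *pages;
	int err;

	pages = malloc(npages * sizeof(vm_page_t), M_DEVBUF,
	    M_NOWAIT | M_ZERO);
	if (pages == NULL)
		return (ENOMEM);

	/* Hold the pages backing [start, start + npages * PAGE_SIZE). */
	err = vm_fault_hold_user_pages(start, pages, npages,
	    writeable ? VM_HOLD_WRITEABLE : 0);
	if (err != 0) {
		free(pages, M_DEVBUF);
		return (err);
	}

	*pagesp = pages;
	*npagesp = npages;
	return (0);
}

static void
release_user_buffer(vm_page_t *pages, int npages)
{
	/* Drop the per-page holds taken above and free the page array. */
	vm_fault_unhold_pages(pages, npages);
	free(pages, M_DEVBUF);
}

Note that vm_fault_hold_user_pages() bzero()s the page array itself before filling it, so the M_ZERO above is redundant but harmless, and the same array and count must later be passed to vm_fault_unhold_pages(), since the holds are taken per page.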