36 #include <linux/icmpv6.h>
38 #include <linux/slab.h>
48 "Max number of connected-mode QPs per interface "
49 "(applied only if shared receive queue is not available)");
51 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
52 static int data_debug_level;
56 "Enable data path debug tracing for connected mode if > 0");
59 #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
61 #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
62 #define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ)
63 #define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
64 #define IPOIB_CM_RX_UPDATE_MASK (0x3)
70 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
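/* IPOIB_CM_IETF_ID is the base of the IB CM service ID used for IPoIB
 * connected mode; the RX_UPDATE/TIMEOUT/DELAY values pace how often passive
 * connections are refreshed and when the stale task retires idle ones, and
 * IPOIB_CM_RX_DRAIN_WRID tags the dummy send WR used to drain a flushed
 * receive QP (see ipoib_cm_start_rx_drain below). */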
72 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
87 for (i = 0; i < frags; ++i)
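/* Post receive buffer 'id' to the shared receive queue; if the post fails,
 * the buffer's fragments are unmapped and its skb is dropped. */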
91 static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
99 for (i = 0; i < priv->cm.num_frags; ++i)
100 priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
102 ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
104 ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
105 ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
106 priv->cm.srq_ring[id].mapping);
108 priv->cm.srq_ring[id].skb = NULL;
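/* Non-SRQ variant: the buffer is posted to the per-connection RX QP instead
 * of a shared receive queue, with the same unmap-on-failure handling. */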
114 static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
128 ret = ib_post_recv(rx->qp, wr, &bad_wr);
130 ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
131 ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
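/* The 12-byte reserve below keeps the IP header 16-byte aligned once the
 * 4-byte IPoIB encapsulation header is added. */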
157 skb_reserve(skb, 12);
166 for (i = 0; i < frags; i++) {
171 skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
173 mapping[i + 1] = ib_dma_map_page(priv->ca, page,
193 static void ipoib_cm_free_rx_ring(struct net_device *dev,
200 if (rx_ring[i].skb) {
201 ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
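/* RX drain: a single marker WR is posted to the last QP on the flush list;
 * since these QPs feed the same CQ, seeing its completion means every earlier
 * receive completion for the flushed QPs has already been reaped. */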
216 if (list_empty(&priv->cm.rx_flush_list) ||
217 !list_empty(&priv->cm.rx_drain_list))
225 if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
226 ipoib_warn(priv, "failed to post drain wr\n");
228 list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
241 list_move(&p->list, &priv->cm.rx_flush_list);
243 ipoib_cm_start_rx_drain(priv);
244 spin_unlock_irqrestore(&priv->lock, flags);
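/* The passive-side RX QP needs only one send WR: it exists solely to carry
 * the drain WR posted above. */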
252 .event_handler = ipoib_cm_rx_event_handler,
256 .cap.max_send_wr = 1,
257 .cap.max_send_sge = 1,
263 if (!ipoib_cm_has_srq(dev)) {
264 attr.cap.max_recv_wr = ipoib_recvq_size;
271 static int ipoib_cm_modify_rx_qp(struct net_device *dev,
277 int qp_attr_mask, ret;
282 ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
287 ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
293 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
296 qp_attr.rq_psn = psn;
299 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
314 ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
319 ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
326 static void ipoib_cm_init_rx_wr(struct net_device *dev,
333 for (i = 0; i < priv->cm.num_frags; ++i)
334 sge[i].lkey = priv->mr->lkey;
337 for (i = 1; i < priv->cm.num_frags; ++i)
342 wr->num_sge = priv->cm.num_frags;
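/* Without an SRQ, each passive connection gets its own receive ring of
 * ipoib_recvq_size buffers, allocated and posted here; nonsrq_conn_qp counts
 * these connections against the module-parameter limit described above. */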
351 struct ib_sge sge[IPOIB_CM_RX_SG];
359 priv->ca->name, ipoib_recvq_size);
369 ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
371 spin_lock_irq(&priv->lock);
374 spin_unlock_irq(&priv->lock);
379 ++priv->cm.nonsrq_conn_qp;
381 spin_unlock_irq(&priv->lock);
383 for (i = 0; i < ipoib_recvq_size; ++i) {
384 if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
386 ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
390 ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
392 ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
393 "failed for buf %d\n", i);
406 spin_lock_irq(&priv->lock);
407 --priv->cm.nonsrq_conn_qp;
408 spin_unlock_irq(&priv->lock);
412 ipoib_cm_free_rx_ring(dev, rx->rx_ring);
432 rep.srq = ipoib_cm_has_srq(dev);
455 INIT_LIST_HEAD(&p->list);
457 p->qp = ipoib_cm_create_rx_qp(dev, p);
459 ret = PTR_ERR(p->qp);
464 ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
468 if (!ipoib_cm_has_srq(dev)) {
469 ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
474 spin_lock_irq(&priv->lock);
481 list_move(&p->list, &priv->cm.passive_ids);
482 spin_unlock_irq(&priv->lock);
484 ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
486 ipoib_warn(priv, "failed to send REP: %d\n", ret);
488 ipoib_warn(priv, "unable to move qp to error state\n");
499 static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
505 switch (event->event) {
507 return ipoib_cm_req_handler(cm_id, event);
514 priv = netdev_priv(p->dev);
516 ipoib_warn(priv, "unable to move qp to error state\n");
523 static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
530 size = min(length, hdr_space);
535 num_frags = skb_shinfo(skb)->nr_frags;
536 for (i = 0; i < num_frags; i++) {
541 skb_fill_page_desc(toskb, i, skb_frag_page(frag),
543 --skb_shinfo(skb)->nr_frags;
547 skb_frag_size_set(frag, size);
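/* skb_put_frags (above) sizes the received data into 'skb' and donates any
 * unused trailing pages to the freshly allocated 'toskb'. The receive
 * completion handler below relies on it for large packets: small packets are
 * copied into a new small skb, large ones swap in newly mapped pages so the
 * original buffer can be passed up the stack intact. */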
572 if (unlikely(wr_id >= ipoib_recvq_size)) {
575 list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
576 ipoib_cm_start_rx_drain(priv);
578 spin_unlock_irqrestore(&priv->lock, flags);
580 ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
581 wr_id, ipoib_recvq_size);
585 p = wc->qp->qp_context;
587 has_srq = ipoib_cm_has_srq(dev);
588 rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
590 skb = rx_ring[wr_id].skb;
594 "(status=%d, wrid=%d vend_err %x)\n",
596 ++dev->stats.rx_dropped;
602 list_move(&p->list, &priv->cm.rx_reap_list);
603 spin_unlock_irqrestore(&priv->lock, flags);
617 list_move(&p->list, &priv->cm.passive_ids);
618 spin_unlock_irqrestore(&priv->lock, flags);
625 small_skb = dev_alloc_skb(dlen + 12);
627 skb_reserve(small_skb, 12);
628 ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
630 skb_copy_from_linear_data(skb, small_skb->data, dlen);
631 ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
642 newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping);
648 ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
649 ++dev->stats.rx_dropped;
653 ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
654 memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
659 skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
663 skb_reset_mac_header(skb);
666 ++dev->stats.rx_packets;
676 if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
677 ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
678 "for buf %d\n", wr_id);
680 if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
685 ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
686 "for buf %d\n", wr_id);
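/* Transmit path: each CM send uses a single SGE over the linearly mapped skb,
 * packets longer than the connected-mode MTU are dropped with a warning, and
 * the net queue is stopped when the TX ring fills up. */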
699 priv->tx_sge[0].length = len;
701 priv->tx_wr.num_sge = 1;
704 return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
715 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
717 ++dev->stats.tx_dropped;
718 ++dev->stats.tx_errors;
723 ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
736 if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
737 ++dev->stats.tx_errors;
747 ipoib_warn(priv, "post_send failed, error %d\n", rc);
748 ++dev->stats.tx_errors;
756 ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
759 ipoib_warn(priv, "request notify on send CQ failed\n");
760 netif_stop_queue(dev);
777 ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
787 ++dev->stats.tx_packets;
788 dev->stats.tx_bytes += tx_req->skb->len;
796 netif_queue_stopped(dev) &&
798 netif_wake_queue(dev);
805 "(status=%d, wrid=%d vend_err %x)\n",
820 list_move(&tx->list, &priv->cm.reap_list);
826 spin_unlock_irqrestore(&priv->lock, flags);
829 netif_tx_unlock(dev);
841 if (IS_ERR(priv->cm.id)) {
843 ret = PTR_ERR(priv->cm.id);
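/* ipoib_cm_free_rx_reap_list tears down reaped passive connections; in the
 * non-SRQ case it also frees the per-connection ring and drops the
 * nonsrq_conn_qp count. */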
864 static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
870 spin_lock_irq(&priv->lock);
871 list_splice_init(&priv->cm.rx_reap_list, &list);
872 spin_unlock_irq(&priv->lock);
877 if (!ipoib_cm_has_srq(dev)) {
878 ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
879 spin_lock_irq(&priv->lock);
880 --priv->cm.nonsrq_conn_qp;
881 spin_unlock_irq(&priv->lock);
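/* Device stop: every passive connection is moved to the error state, then the
 * error/flush/drain lists are polled until the drain completions empty them,
 * and finally the reap list is freed. */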
900 spin_lock_irq(&priv->lock);
901 while (!list_empty(&priv->cm.passive_ids)) {
903 list_move(&p->list, &priv->cm.rx_error_list);
905 spin_unlock_irq(&priv->lock);
908 ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
909 spin_lock_irq(&priv->lock);
915 while (!list_empty(&priv->cm.rx_error_list) ||
916 !list_empty(&priv->cm.rx_flush_list) ||
917 !list_empty(&priv->cm.rx_drain_list)) {
924 list_splice_init(&priv->cm.rx_flush_list,
925 &priv->cm.rx_reap_list);
926 list_splice_init(&priv->cm.rx_error_list,
927 &priv->cm.rx_reap_list);
928 list_splice_init(&priv->cm.rx_drain_list,
929 &priv->cm.rx_reap_list);
932 spin_unlock_irq(&priv->lock);
935 spin_lock_irq(&priv->lock);
938 spin_unlock_irq(&priv->lock);
940 ipoib_cm_free_rx_reap_list(dev);
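/* Active-side REP handling: reject the connection if the peer's advertised
 * MTU is too small (see the warning below), otherwise bring the TX QP through
 * RTR and RTS, flush any skbs queued on the neigh while the connection was
 * forming, and send the RTU. */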
952 int qp_attr_mask, ret;
958 ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
966 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
973 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
980 ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
985 ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
989 skb_queue_head_init(&skqueue);
991 spin_lock_irq(&priv->lock);
994 while ((skb = __skb_dequeue(&p->neigh->queue)))
995 __skb_queue_tail(&skqueue, skb);
996 spin_unlock_irq(&priv->lock);
998 while ((skb = __skb_dequeue(&skqueue))) {
1002 "to requeue packet\n");
1007 ipoib_warn(priv, "failed to send RTU: %d\n", ret);
1019 .srq = priv->cm.srq,
1021 .cap.max_send_sge = 1,
1030 static int ipoib_cm_send_req(struct net_device *dev,
1063 req.srq = ipoib_cm_has_srq(dev);
1067 static int ipoib_cm_modify_tx_init(struct net_device *dev,
1072 int qp_attr_mask, ret;
1081 qp_attr.port_num = priv->port;
1086 ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
1100 ipoib_warn(priv, "failed to allocate tx ring\n");
1105 p->qp = ipoib_cm_create_tx_qp(p->dev, p);
1106 if (IS_ERR(p->qp)) {
1107 ret = PTR_ERR(p->qp);
1108 ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
1113 if (IS_ERR(p->id)) {
1114 ret = PTR_ERR(p->id);
1115 ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
1119 ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
1121 ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
1125 ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
1127 ipoib_warn(priv, "failed to send cm req: %d\n", ret);
1131 ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
1149 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
1153 unsigned long begin;
1155 ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
1166 ipoib_warn(priv, "timing out; %d sends not completed\n",
1179 ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
1183 netif_tx_lock_bh(p->dev);
1185 netif_queue_stopped(p->dev) &&
1187 netif_wake_queue(p->dev);
1188 netif_tx_unlock_bh(p->dev);
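/* CM event handler for active (TX) connections: a received REP is passed to
 * ipoib_cm_rep_handler; errors and disconnects move the connection to the
 * reap list so ipoib_cm_tx_reap can destroy it outside the event handler. */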
1198 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
1205 unsigned long flags;
1208 switch (event->event) {
1215 ret = ipoib_cm_rep_handler(cm_id, event);
1224 netif_tx_lock_bh(dev);
1237 list_move(&tx->list, &priv->cm.reap_list);
1241 spin_unlock_irqrestore(&priv->lock, flags);
1242 netif_tx_unlock_bh(dev);
1265 list_add(&tx->list, &priv->cm.start_list);
1274 unsigned long flags;
1277 list_move(&tx->list, &priv->cm.reap_list);
1279 ipoib_dbg(priv, "Reap connection for gid %pI6\n",
1280 tx->neigh->daddr + 4);
1282 spin_unlock_irqrestore(&priv->lock, flags);
1293 unsigned long flags;
1299 netif_tx_lock_bh(dev);
1302 while (!list_empty(&priv->cm.start_list)) {
1304 list_del_init(&p->list);
1307 memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
1309 spin_unlock_irqrestore(&priv->lock, flags);
1310 netif_tx_unlock_bh(dev);
1312 ret = ipoib_cm_tx_init(p, qpn, &pathrec);
1314 netif_tx_lock_bh(dev);
1329 spin_unlock_irqrestore(&priv->lock, flags);
1330 netif_tx_unlock_bh(dev);
1333 static void ipoib_cm_tx_reap(struct work_struct *work)
1339 unsigned long flags;
1341 netif_tx_lock_bh(dev);
1344 while (!list_empty(&priv->cm.reap_list)) {
1347 spin_unlock_irqrestore(&priv->lock, flags);
1348 netif_tx_unlock_bh(dev);
1349 ipoib_cm_tx_destroy(p);
1350 netif_tx_lock_bh(dev);
1354 spin_unlock_irqrestore(&priv->lock, flags);
1355 netif_tx_unlock_bh(dev);
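/* skb_reap work: for each queued over-length skb, the sender is notified that
 * the packet was too big (via ICMP, or ICMPv6 when IPv6 is enabled);
 * ipoib_cm_skb_too_long below updates the cached path MTU and queues the skb
 * for this work item. */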
1358 static void ipoib_cm_skb_reap(struct work_struct *work)
1364 unsigned long flags;
1367 netif_tx_lock_bh(dev);
1370 while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
1371 spin_unlock_irqrestore(&priv->lock, flags);
1372 netif_tx_unlock_bh(dev);
1376 #if IS_ENABLED(CONFIG_IPV6)
1382 netif_tx_lock_bh(dev);
1386 spin_unlock_irqrestore(&priv->lock, flags);
1387 netif_tx_unlock_bh(dev);
1394 int e = skb_queue_empty(&priv->cm.skb_queue);
1397 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
1404 static void ipoib_cm_rx_reap(struct work_struct *work)
1407 cm.rx_reap_task)->dev);
1410 static void ipoib_cm_stale_task(struct work_struct *work)
1413 cm.stale_task.work);
1417 spin_lock_irq(&priv->lock);
1418 while (!list_empty(&priv->cm.passive_ids)) {
1424 list_move(&p->list, &priv->cm.rx_error_list);
1426 spin_unlock_irq(&priv->lock);
1429 ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
1430 spin_lock_irq(&priv->lock);
1433 if (!list_empty(&priv->cm.passive_ids))
1436 spin_unlock_irq(&priv->lock);
1446 return sprintf(buf, "connected\n");
1448 return sprintf(buf, "datagram\n");
1452 const char *buf, size_t count)
1458 return restart_syscall();
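/* SRQ setup: try to create a shared receive queue with ipoib_recvq_size
 * entries; on failure priv->cm.srq is left NULL (silently for -ENOSYS, i.e.
 * no SRQ support) and the driver falls back to per-connection receive rings. */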
1483 .max_wr = ipoib_recvq_size,
1489 if (IS_ERR(priv->cm.srq)) {
1490 if (PTR_ERR(priv->cm.srq) != -ENOSYS)
1492 priv->ca->name, PTR_ERR(priv->cm.srq));
1493 priv->cm.srq = NULL;
1497 priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring);
1498 if (!priv->cm.srq_ring) {
1500 priv->ca->name, ipoib_recvq_size);
1502 priv->cm.srq = NULL;
1514 INIT_LIST_HEAD(&priv->cm.passive_ids);
1515 INIT_LIST_HEAD(&priv->cm.reap_list);
1516 INIT_LIST_HEAD(&priv->cm.start_list);
1517 INIT_LIST_HEAD(&priv->cm.rx_error_list);
1518 INIT_LIST_HEAD(&priv->cm.rx_flush_list);
1519 INIT_LIST_HEAD(&priv->cm.rx_drain_list);
1520 INIT_LIST_HEAD(&priv->cm.rx_reap_list);
1521 INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
1522 INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
1523 INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
1524 INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
1527 skb_queue_head_init(&priv->cm.skb_queue);
1539 if (ipoib_cm_has_srq(dev)) {
1542 ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
1543 priv->cm.max_cm_mtu, priv->cm.num_frags);
1549 ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
1551 if (ipoib_cm_has_srq(dev)) {
1552 for (i = 0; i < ipoib_recvq_size; ++i) {
1553 if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
1554 priv->cm.num_frags - 1,
1555 priv->cm.srq_ring[i].mapping)) {
1557 "receive buffer %d\n", i);
1562 if (ipoib_cm_post_receive_srq(dev, i)) {
1563 ipoib_warn(priv, "ipoib_cm_post_receive_srq "
1564 "failed for buf %d\n", i);
1583 ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
1587 ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
1589 priv->cm.srq = NULL;
1590 if (!priv->cm.srq_ring)
1593 ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
1594 priv->cm.srq_ring = NULL;