Linux Kernel  3.7.1
ipoib_cm.c
1 /*
2  * Copyright (c) 2006 Mellanox Technologies. All rights reserved
3  *
4  * This software is available to you under a choice of one of two
5  * licenses. You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  * Redistribution and use in source and binary forms, with or
11  * without modification, are permitted provided that the following
12  * conditions are met:
13  *
14  * - Redistributions of source code must retain the above
15  * copyright notice, this list of conditions and the following
16  * disclaimer.
17  *
18  * - Redistributions in binary form must reproduce the above
19  * copyright notice, this list of conditions and the following
20  * disclaimer in the documentation and/or other materials
21  * provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <rdma/ib_cm.h>
34 #include <net/dst.h>
35 #include <net/icmp.h>
36 #include <linux/icmpv6.h>
37 #include <linux/delay.h>
38 #include <linux/slab.h>
39 #include <linux/vmalloc.h>
40 #include <linux/moduleparam.h>
41 
42 #include "ipoib.h"
43 
44 int ipoib_max_conn_qp = 128;
45 
46 module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
47 MODULE_PARM_DESC(max_nonsrq_conn_qp,
48  "Max number of connected-mode QPs per interface "
49  "(applied only if shared receive queue is not available)");
50 
51 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
52 static int data_debug_level;
53 
54 module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
55 MODULE_PARM_DESC(cm_data_debug_level,
56  "Enable data path debug tracing for connected mode if > 0");
57 #endif
58 
59 #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
60 
61 #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
62 #define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ)
63 #define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
64 #define IPOIB_CM_RX_UPDATE_MASK (0x3)
65 
66 static struct ib_qp_attr ipoib_cm_err_attr = {
67  .qp_state = IB_QPS_ERR
68 };
69 
70 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
71 
72 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
73  .wr_id = IPOIB_CM_RX_DRAIN_WRID,
74  .opcode = IB_WR_SEND,
75 };
76 
77 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
78  struct ib_cm_event *event);
79 
80 static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
81  u64 mapping[IPOIB_CM_RX_SG])
82 {
83  int i;
84 
85  ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
86 
87  for (i = 0; i < frags; ++i)
88  ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
89 }
90 
91 static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
92 {
93  struct ipoib_dev_priv *priv = netdev_priv(dev);
94  struct ib_recv_wr *bad_wr;
95  int i, ret;
96 
97  priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
98 
99  for (i = 0; i < priv->cm.num_frags; ++i)
100  priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
101 
102  ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
103  if (unlikely(ret)) {
104  ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
105  ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
106  priv->cm.srq_ring[id].mapping);
107  dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
108  priv->cm.srq_ring[id].skb = NULL;
109  }
110 
111  return ret;
112 }
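
The wr_id posted above packs the ring-slot index together with two high flag bits (IPOIB_OP_CM | IPOIB_OP_RECV) so the completion handler can recognize connected-mode receives and recover the slot by masking the flags back off. A minimal, self-contained sketch of that encoding, using placeholder flag values rather than the driver's real definitions from ipoib.h:

#include <stdio.h>

/* Placeholder flags for illustration only; the real values are defined in ipoib.h. */
#define DEMO_OP_RECV (1u << 31)
#define DEMO_OP_CM   (1u << 30)

int main(void)
{
        unsigned int id = 5;                                  /* ring slot index */
        unsigned int wr_id = id | DEMO_OP_CM | DEMO_OP_RECV;  /* tag as a CM receive */

        /* The RX completion handler masks the flags off to get the slot back. */
        printf("slot = %u\n", wr_id & ~(DEMO_OP_CM | DEMO_OP_RECV));
        return 0;
}
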
113 
114 static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
115  struct ipoib_cm_rx *rx,
116  struct ib_recv_wr *wr,
117  struct ib_sge *sge, int id)
118 {
119  struct ipoib_dev_priv *priv = netdev_priv(dev);
120  struct ib_recv_wr *bad_wr;
121  int i, ret;
122 
123  wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
124 
125  for (i = 0; i < IPOIB_CM_RX_SG; ++i)
126  sge[i].addr = rx->rx_ring[id].mapping[i];
127 
128  ret = ib_post_recv(rx->qp, wr, &bad_wr);
129  if (unlikely(ret)) {
130  ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
131  ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
132  rx->rx_ring[id].mapping);
133  dev_kfree_skb_any(rx->rx_ring[id].skb);
134  rx->rx_ring[id].skb = NULL;
135  }
136 
137  return ret;
138 }
139 
140 static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
141  struct ipoib_cm_rx_buf *rx_ring,
142  int id, int frags,
143  u64 mapping[IPOIB_CM_RX_SG])
144 {
145  struct ipoib_dev_priv *priv = netdev_priv(dev);
146  struct sk_buff *skb;
147  int i;
148 
149  skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
150  if (unlikely(!skb))
151  return NULL;
152 
153  /*
154  * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
155  * IP header to a multiple of 16.
156  */
157  skb_reserve(skb, 12);
158 
159  mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
160  DMA_FROM_DEVICE);
161  if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
162  dev_kfree_skb_any(skb);
163  return NULL;
164  }
165 
166  for (i = 0; i < frags; i++) {
167  struct page *page = alloc_page(GFP_ATOMIC);
168 
169  if (!page)
170  goto partial_error;
171  skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
172 
173  mapping[i + 1] = ib_dma_map_page(priv->ca, page,
174  0, PAGE_SIZE, DMA_FROM_DEVICE);
175  if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
176  goto partial_error;
177  }
178 
179  rx_ring[id].skb = skb;
180  return skb;
181 
182 partial_error:
183 
184  ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
185 
186  for (; i > 0; --i)
187  ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
188 
189  dev_kfree_skb_any(skb);
190  return NULL;
191 }
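
The 12-byte reserve above is plain alignment arithmetic: the 4-byte IPoIB encapsulation header lands at the start of the received data, so reserving 12 bytes puts the IP header that follows it at a 16-byte offset. A small worked check of that offset, assuming the 4-byte encapsulation header used throughout this driver:

#include <assert.h>

#define ENCAP_LEN 4   /* IPoIB encapsulation header length used by this driver */
#define RESERVE   12  /* bytes reserved before the packet data, as in the code above */

int main(void)
{
        /* The IP header starts after the reserve plus the encapsulation header. */
        int ip_offset = RESERVE + ENCAP_LEN;

        assert(ip_offset % 16 == 0);  /* aligned to a multiple of 16, as the comment claims */
        return 0;
}
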
192 
193 static void ipoib_cm_free_rx_ring(struct net_device *dev,
194  struct ipoib_cm_rx_buf *rx_ring)
195 {
196  struct ipoib_dev_priv *priv = netdev_priv(dev);
197  int i;
198 
199  for (i = 0; i < ipoib_recvq_size; ++i)
200  if (rx_ring[i].skb) {
201  ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
202  rx_ring[i].mapping);
203  dev_kfree_skb_any(rx_ring[i].skb);
204  }
205 
206  vfree(rx_ring);
207 }
208 
209 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
210 {
211  struct ib_send_wr *bad_wr;
212  struct ipoib_cm_rx *p;
213 
214  /* We only reserved 1 extra slot in CQ for drain WRs, so
215  * make sure we have at most 1 outstanding WR. */
216  if (list_empty(&priv->cm.rx_flush_list) ||
217  !list_empty(&priv->cm.rx_drain_list))
218  return;
219 
220  /*
221  * QPs on flush list are error state. This way, a "flush
222  * error" WC will be immediately generated for each WR we post.
223  */
224  p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
225  if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
226  ipoib_warn(priv, "failed to post drain wr\n");
227 
228  list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
229 }
230 
231 static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
232 {
233  struct ipoib_cm_rx *p = ctx;
234  struct ipoib_dev_priv *priv = netdev_priv(p->dev);
235  unsigned long flags;
236 
237  if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
238  return;
239 
240  spin_lock_irqsave(&priv->lock, flags);
241  list_move(&p->list, &priv->cm.rx_flush_list);
242  p->state = IPOIB_CM_RX_FLUSH;
243  ipoib_cm_start_rx_drain(priv);
244  spin_unlock_irqrestore(&priv->lock, flags);
245 }
246 
247 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
248  struct ipoib_cm_rx *p)
249 {
250  struct ipoib_dev_priv *priv = netdev_priv(dev);
251  struct ib_qp_init_attr attr = {
252  .event_handler = ipoib_cm_rx_event_handler,
253  .send_cq = priv->recv_cq, /* For drain WR */
254  .recv_cq = priv->recv_cq,
255  .srq = priv->cm.srq,
256  .cap.max_send_wr = 1, /* For drain WR */
257  .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
258  .sq_sig_type = IB_SIGNAL_ALL_WR,
259  .qp_type = IB_QPT_RC,
260  .qp_context = p,
261  };
262 
263  if (!ipoib_cm_has_srq(dev)) {
264  attr.cap.max_recv_wr = ipoib_recvq_size;
265  attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
266  }
267 
268  return ib_create_qp(priv->pd, &attr);
269 }
270 
271 static int ipoib_cm_modify_rx_qp(struct net_device *dev,
272  struct ib_cm_id *cm_id, struct ib_qp *qp,
273  unsigned psn)
274 {
275  struct ipoib_dev_priv *priv = netdev_priv(dev);
276  struct ib_qp_attr qp_attr;
277  int qp_attr_mask, ret;
278 
279  qp_attr.qp_state = IB_QPS_INIT;
280  ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
281  if (ret) {
282  ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
283  return ret;
284  }
285  ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
286  if (ret) {
287  ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
288  return ret;
289  }
290  qp_attr.qp_state = IB_QPS_RTR;
291  ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
292  if (ret) {
293  ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
294  return ret;
295  }
296  qp_attr.rq_psn = psn;
297  ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
298  if (ret) {
299  ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
300  return ret;
301  }
302 
303  /*
304  * Current Mellanox HCA firmware won't generate completions
305  * with error for drain WRs unless the QP has been moved to
306  * RTS first. This work-around leaves a window where a QP has
307  * moved to error asynchronously, but this will eventually get
308  * fixed in firmware, so let's not error out if modify QP
309  * fails.
310  */
311  qp_attr.qp_state = IB_QPS_RTS;
312  ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
313  if (ret) {
314  ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
315  return 0;
316  }
317  ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
318  if (ret) {
319  ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
320  return 0;
321  }
322 
323  return 0;
324 }
325 
326 static void ipoib_cm_init_rx_wr(struct net_device *dev,
327  struct ib_recv_wr *wr,
328  struct ib_sge *sge)
329 {
330  struct ipoib_dev_priv *priv = netdev_priv(dev);
331  int i;
332 
333  for (i = 0; i < priv->cm.num_frags; ++i)
334  sge[i].lkey = priv->mr->lkey;
335 
336  sge[0].length = IPOIB_CM_HEAD_SIZE;
337  for (i = 1; i < priv->cm.num_frags; ++i)
338  sge[i].length = PAGE_SIZE;
339 
340  wr->next = NULL;
341  wr->sg_list = sge;
342  wr->num_sge = priv->cm.num_frags;
343 }
344 
345 static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
346  struct ipoib_cm_rx *rx)
347 {
348  struct ipoib_dev_priv *priv = netdev_priv(dev);
349  struct {
350  struct ib_recv_wr wr;
351  struct ib_sge sge[IPOIB_CM_RX_SG];
352  } *t;
353  int ret;
354  int i;
355 
356  rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring);
357  if (!rx->rx_ring) {
358  printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
359  priv->ca->name, ipoib_recvq_size);
360  return -ENOMEM;
361  }
362 
363  t = kmalloc(sizeof *t, GFP_KERNEL);
364  if (!t) {
365  ret = -ENOMEM;
366  goto err_free;
367  }
368 
369  ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
370 
371  spin_lock_irq(&priv->lock);
372 
373  if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
374  spin_unlock_irq(&priv->lock);
375  ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
376  ret = -EINVAL;
377  goto err_free;
378  } else
379  ++priv->cm.nonsrq_conn_qp;
380 
381  spin_unlock_irq(&priv->lock);
382 
383  for (i = 0; i < ipoib_recvq_size; ++i) {
384  if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
385  rx->rx_ring[i].mapping)) {
386  ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
387  ret = -ENOMEM;
388  goto err_count;
389  }
390  ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
391  if (ret) {
392  ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
393  "failed for buf %d\n", i);
394  ret = -EIO;
395  goto err_count;
396  }
397  }
398 
399  rx->recv_count = ipoib_recvq_size;
400 
401  kfree(t);
402 
403  return 0;
404 
405 err_count:
406  spin_lock_irq(&priv->lock);
407  --priv->cm.nonsrq_conn_qp;
408  spin_unlock_irq(&priv->lock);
409 
410 err_free:
411  kfree(t);
412  ipoib_cm_free_rx_ring(dev, rx->rx_ring);
413 
414  return ret;
415 }
416 
417 static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
418  struct ib_qp *qp, struct ib_cm_req_event_param *req,
419  unsigned psn)
420 {
421  struct ipoib_dev_priv *priv = netdev_priv(dev);
422  struct ipoib_cm_data data = {};
423  struct ib_cm_rep_param rep = {};
424 
425  data.qpn = cpu_to_be32(priv->qp->qp_num);
426  data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
427 
428  rep.private_data = &data;
429  rep.private_data_len = sizeof data;
430  rep.flow_control = 0;
431  rep.rnr_retry_count = req->rnr_retry_count;
432  rep.srq = ipoib_cm_has_srq(dev);
433  rep.qp_num = qp->qp_num;
434  rep.starting_psn = psn;
435  return ib_send_cm_rep(cm_id, &rep);
436 }
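
Both the REQ and the REP carry a small private-data blob (struct ipoib_cm_data) holding the sender's datagram QPN and its receive buffer size; the peer reads the mtu field in ipoib_cm_rep_handler() below. A sketch of the layout implied by the two fields assigned here, both in network byte order (the authoritative definition lives in ipoib.h):

#include <stdint.h>

/* Illustrative layout inferred from the data.qpn / data.mtu assignments in this file. */
struct ipoib_cm_data_sketch {
        uint32_t qpn;   /* local UD QP number, big-endian on the wire */
        uint32_t mtu;   /* sender's receive buffer size, big-endian on the wire */
};
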
437 
438 static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
439 {
440  struct net_device *dev = cm_id->context;
441  struct ipoib_dev_priv *priv = netdev_priv(dev);
442  struct ipoib_cm_rx *p;
443  unsigned psn;
444  int ret;
445 
446  ipoib_dbg(priv, "REQ arrived\n");
447  p = kzalloc(sizeof *p, GFP_KERNEL);
448  if (!p)
449  return -ENOMEM;
450  p->dev = dev;
451  p->id = cm_id;
452  cm_id->context = p;
453  p->state = IPOIB_CM_RX_LIVE;
454  p->jiffies = jiffies;
455  INIT_LIST_HEAD(&p->list);
456 
457  p->qp = ipoib_cm_create_rx_qp(dev, p);
458  if (IS_ERR(p->qp)) {
459  ret = PTR_ERR(p->qp);
460  goto err_qp;
461  }
462 
463  psn = random32() & 0xffffff;
464  ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
465  if (ret)
466  goto err_modify;
467 
468  if (!ipoib_cm_has_srq(dev)) {
469  ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
470  if (ret)
471  goto err_modify;
472  }
473 
474  spin_lock_irq(&priv->lock);
475  queue_delayed_work(ipoib_workqueue,
476  &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
477  /* Add this entry to passive ids list head, but do not re-add it
478  * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
479  p->jiffies = jiffies;
480  if (p->state == IPOIB_CM_RX_LIVE)
481  list_move(&p->list, &priv->cm.passive_ids);
482  spin_unlock_irq(&priv->lock);
483 
484  ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
485  if (ret) {
486  ipoib_warn(priv, "failed to send REP: %d\n", ret);
487  if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
488  ipoib_warn(priv, "unable to move qp to error state\n");
489  }
490  return 0;
491 
492 err_modify:
493  ib_destroy_qp(p->qp);
494 err_qp:
495  kfree(p);
496  return ret;
497 }
498 
499 static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
500  struct ib_cm_event *event)
501 {
502  struct ipoib_cm_rx *p;
503  struct ipoib_dev_priv *priv;
504 
505  switch (event->event) {
506  case IB_CM_REQ_RECEIVED:
507  return ipoib_cm_req_handler(cm_id, event);
508  case IB_CM_DREQ_RECEIVED:
509  p = cm_id->context;
510  ib_send_cm_drep(cm_id, NULL, 0);
511  /* Fall through */
512  case IB_CM_REJ_RECEIVED:
513  p = cm_id->context;
514  priv = netdev_priv(p->dev);
515  if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
516  ipoib_warn(priv, "unable to move qp to error state\n");
517  /* Fall through */
518  default:
519  return 0;
520  }
521 }
522 /* Adjust length of skb with fragments to match received data */
523 static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
524  unsigned int length, struct sk_buff *toskb)
525 {
526  int i, num_frags;
527  unsigned int size;
528 
529  /* put header into skb */
530  size = min(length, hdr_space);
531  skb->tail += size;
532  skb->len += size;
533  length -= size;
534 
535  num_frags = skb_shinfo(skb)->nr_frags;
536  for (i = 0; i < num_frags; i++) {
537  skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
538 
539  if (length == 0) {
540  /* don't need this page */
541  skb_fill_page_desc(toskb, i, skb_frag_page(frag),
542  0, PAGE_SIZE);
543  --skb_shinfo(skb)->nr_frags;
544  } else {
545  size = min(length, (unsigned) PAGE_SIZE);
546 
547  skb_frag_size_set(frag, size);
548  skb->data_len += size;
549  skb->truesize += size;
550  skb->len += size;
551  length -= size;
552  }
553  }
554 }
555 
556 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
557 {
558  struct ipoib_dev_priv *priv = netdev_priv(dev);
559  struct ipoib_cm_rx_buf *rx_ring;
560  unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
561  struct sk_buff *skb, *newskb;
562  struct ipoib_cm_rx *p;
563  unsigned long flags;
564  u64 mapping[IPOIB_CM_RX_SG];
565  int frags;
566  int has_srq;
567  struct sk_buff *small_skb;
568 
569  ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
570  wr_id, wc->status);
571 
572  if (unlikely(wr_id >= ipoib_recvq_size)) {
573  if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
574  spin_lock_irqsave(&priv->lock, flags);
575  list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
576  ipoib_cm_start_rx_drain(priv);
577  queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
578  spin_unlock_irqrestore(&priv->lock, flags);
579  } else
580  ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
581  wr_id, ipoib_recvq_size);
582  return;
583  }
584 
585  p = wc->qp->qp_context;
586 
587  has_srq = ipoib_cm_has_srq(dev);
588  rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
589 
590  skb = rx_ring[wr_id].skb;
591 
592  if (unlikely(wc->status != IB_WC_SUCCESS)) {
593  ipoib_dbg(priv, "cm recv error "
594  "(status=%d, wrid=%d vend_err %x)\n",
595  wc->status, wr_id, wc->vendor_err);
596  ++dev->stats.rx_dropped;
597  if (has_srq)
598  goto repost;
599  else {
600  if (!--p->recv_count) {
601  spin_lock_irqsave(&priv->lock, flags);
602  list_move(&p->list, &priv->cm.rx_reap_list);
603  spin_unlock_irqrestore(&priv->lock, flags);
604  queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
605  }
606  return;
607  }
608  }
609 
610  if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
611  if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
612  spin_lock_irqsave(&priv->lock, flags);
613  p->jiffies = jiffies;
614  /* Move this entry to list head, but do not re-add it
615  * if it has been moved out of list. */
616  if (p->state == IPOIB_CM_RX_LIVE)
617  list_move(&p->list, &priv->cm.passive_ids);
618  spin_unlock_irqrestore(&priv->lock, flags);
619  }
620  }
621 
622  if (wc->byte_len < IPOIB_CM_COPYBREAK) {
623  int dlen = wc->byte_len;
624 
625  small_skb = dev_alloc_skb(dlen + 12);
626  if (small_skb) {
627  skb_reserve(small_skb, 12);
628  ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
629  dlen, DMA_FROM_DEVICE);
630  skb_copy_from_linear_data(skb, small_skb->data, dlen);
631  ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
632  dlen, DMA_FROM_DEVICE);
633  skb_put(small_skb, dlen);
634  skb = small_skb;
635  goto copied;
636  }
637  }
638 
639  frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
640  (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
641 
642  newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping);
643  if (unlikely(!newskb)) {
644  /*
645  * If we can't allocate a new RX buffer, dump
646  * this packet and reuse the old buffer.
647  */
648  ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
649  ++dev->stats.rx_dropped;
650  goto repost;
651  }
652 
653  ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
654  memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
655 
656  ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
657  wc->byte_len, wc->slid);
658 
659  skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
660 
661 copied:
662  skb->protocol = ((struct ipoib_header *) skb->data)->proto;
663  skb_reset_mac_header(skb);
664  skb_pull(skb, IPOIB_ENCAP_LEN);
665 
666  ++dev->stats.rx_packets;
667  dev->stats.rx_bytes += skb->len;
668 
669  skb->dev = dev;
670  /* XXX get correct PACKET_ type here */
671  skb->pkt_type = PACKET_HOST;
672  netif_receive_skb(skb);
673 
674 repost:
675  if (has_srq) {
676  if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
677  ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
678  "for buf %d\n", wr_id);
679  } else {
680  if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
681  &priv->cm.rx_wr,
682  priv->cm.rx_sge,
683  wr_id))) {
684  --p->recv_count;
685  ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
686  "for buf %d\n", wr_id);
687  }
688  }
689 }
690 
691 static inline int post_send(struct ipoib_dev_priv *priv,
692  struct ipoib_cm_tx *tx,
693  unsigned int wr_id,
694  u64 addr, int len)
695 {
696  struct ib_send_wr *bad_wr;
697 
698  priv->tx_sge[0].addr = addr;
699  priv->tx_sge[0].length = len;
700 
701  priv->tx_wr.num_sge = 1;
702  priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
703 
704  return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
705 }
706 
707 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
708 {
709  struct ipoib_dev_priv *priv = netdev_priv(dev);
710  struct ipoib_cm_tx_buf *tx_req;
711  u64 addr;
712  int rc;
713 
714  if (unlikely(skb->len > tx->mtu)) {
715  ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
716  skb->len, tx->mtu);
717  ++dev->stats.tx_dropped;
718  ++dev->stats.tx_errors;
719  ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
720  return;
721  }
722 
723  ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
724  tx->tx_head, skb->len, tx->qp->qp_num);
725 
726  /*
727  * We put the skb into the tx_ring _before_ we call post_send()
728  * because it's entirely possible that the completion handler will
729  * run before we execute anything after the post_send(). That
730  * means we have to make sure everything is properly recorded and
731  * our state is consistent before we call post_send().
732  */
733  tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
734  tx_req->skb = skb;
735  addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
736  if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
737  ++dev->stats.tx_errors;
738  dev_kfree_skb_any(skb);
739  return;
740  }
741 
742  tx_req->mapping = addr;
743 
744  rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
745  addr, skb->len);
746  if (unlikely(rc)) {
747  ipoib_warn(priv, "post_send failed, error %d\n", rc);
748  ++dev->stats.tx_errors;
749  ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
750  dev_kfree_skb_any(skb);
751  } else {
752  dev->trans_start = jiffies;
753  ++tx->tx_head;
754 
755  if (++priv->tx_outstanding == ipoib_sendq_size) {
756  ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
757  tx->qp->qp_num);
758  if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
759  ipoib_warn(priv, "request notify on send CQ failed\n");
760  netif_stop_queue(dev);
761  }
762  }
763 }
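
tx_head and tx_tail are free-running counters; masking with (ipoib_sendq_size - 1) maps them onto ring slots, which only works because the driver keeps ipoib_sendq_size a power of two. A minimal sketch of that indexing, with a hypothetical ring size:

#include <stdio.h>

#define SENDQ_SIZE 64   /* hypothetical; must be a power of two for the mask to work */

int main(void)
{
        unsigned int head;

        /* The free-running counter keeps increasing; the mask picks the ring slot. */
        for (head = 62; head < 67; head++)
                printf("head %u -> slot %u\n", head, head & (SENDQ_SIZE - 1));
        return 0;
}
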
764 
765 void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
766 {
767  struct ipoib_dev_priv *priv = netdev_priv(dev);
768  struct ipoib_cm_tx *tx = wc->qp->qp_context;
769  unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
770  struct ipoib_cm_tx_buf *tx_req;
771  unsigned long flags;
772 
773  ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
774  wr_id, wc->status);
775 
776  if (unlikely(wr_id >= ipoib_sendq_size)) {
777  ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
778  wr_id, ipoib_sendq_size);
779  return;
780  }
781 
782  tx_req = &tx->tx_ring[wr_id];
783 
784  ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
785 
786  /* FIXME: is this right? Shouldn't we only increment on success? */
787  ++dev->stats.tx_packets;
788  dev->stats.tx_bytes += tx_req->skb->len;
789 
790  dev_kfree_skb_any(tx_req->skb);
791 
792  netif_tx_lock(dev);
793 
794  ++tx->tx_tail;
795  if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
796  netif_queue_stopped(dev) &&
797  test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
798  netif_wake_queue(dev);
799 
800  if (wc->status != IB_WC_SUCCESS &&
801  wc->status != IB_WC_WR_FLUSH_ERR) {
802  struct ipoib_neigh *neigh;
803 
804  ipoib_dbg(priv, "failed cm send event "
805  "(status=%d, wrid=%d vend_err %x)\n",
806  wc->status, wr_id, wc->vendor_err);
807 
808  spin_lock_irqsave(&priv->lock, flags);
809  neigh = tx->neigh;
810 
811  if (neigh) {
812  neigh->cm = NULL;
813  list_del(&neigh->list);
814  ipoib_neigh_free(neigh);
815 
816  tx->neigh = NULL;
817  }
818 
819  if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
820  list_move(&tx->list, &priv->cm.reap_list);
821  queue_work(ipoib_workqueue, &priv->cm.reap_task);
822  }
823 
824  clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
825 
826  spin_unlock_irqrestore(&priv->lock, flags);
827  }
828 
829  netif_tx_unlock(dev);
830 }
831 
832 int ipoib_cm_dev_open(struct net_device *dev)
833 {
834  struct ipoib_dev_priv *priv = netdev_priv(dev);
835  int ret;
836 
837  if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
838  return 0;
839 
840  priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
841  if (IS_ERR(priv->cm.id)) {
842  printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
843  ret = PTR_ERR(priv->cm.id);
844  goto err_cm;
845  }
846 
847  ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
848  0, NULL);
849  if (ret) {
850  printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
851  IPOIB_CM_IETF_ID | priv->qp->qp_num);
852  goto err_listen;
853  }
854 
855  return 0;
856 
857 err_listen:
858  ib_destroy_cm_id(priv->cm.id);
859 err_cm:
860  priv->cm.id = NULL;
861  return ret;
862 }
863 
864 static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
865 {
866  struct ipoib_dev_priv *priv = netdev_priv(dev);
867  struct ipoib_cm_rx *rx, *n;
868  LIST_HEAD(list);
869 
870  spin_lock_irq(&priv->lock);
871  list_splice_init(&priv->cm.rx_reap_list, &list);
872  spin_unlock_irq(&priv->lock);
873 
874  list_for_each_entry_safe(rx, n, &list, list) {
875  ib_destroy_cm_id(rx->id);
876  ib_destroy_qp(rx->qp);
877  if (!ipoib_cm_has_srq(dev)) {
878  ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
879  spin_lock_irq(&priv->lock);
880  --priv->cm.nonsrq_conn_qp;
881  spin_unlock_irq(&priv->lock);
882  }
883  kfree(rx);
884  }
885 }
886 
887 void ipoib_cm_dev_stop(struct net_device *dev)
888 {
889  struct ipoib_dev_priv *priv = netdev_priv(dev);
890  struct ipoib_cm_rx *p;
891  unsigned long begin;
892  int ret;
893 
894  if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
895  return;
896 
897  ib_destroy_cm_id(priv->cm.id);
898  priv->cm.id = NULL;
899 
900  spin_lock_irq(&priv->lock);
901  while (!list_empty(&priv->cm.passive_ids)) {
902  p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
903  list_move(&p->list, &priv->cm.rx_error_list);
904  p->state = IPOIB_CM_RX_ERROR;
905  spin_unlock_irq(&priv->lock);
906  ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
907  if (ret)
908  ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
909  spin_lock_irq(&priv->lock);
910  }
911 
912  /* Wait for all RX to be drained */
913  begin = jiffies;
914 
915  while (!list_empty(&priv->cm.rx_error_list) ||
916  !list_empty(&priv->cm.rx_flush_list) ||
917  !list_empty(&priv->cm.rx_drain_list)) {
918  if (time_after(jiffies, begin + 5 * HZ)) {
919  ipoib_warn(priv, "RX drain timing out\n");
920 
921  /*
922  * assume the HW is wedged and just free up everything.
923  */
924  list_splice_init(&priv->cm.rx_flush_list,
925  &priv->cm.rx_reap_list);
926  list_splice_init(&priv->cm.rx_error_list,
927  &priv->cm.rx_reap_list);
928  list_splice_init(&priv->cm.rx_drain_list,
929  &priv->cm.rx_reap_list);
930  break;
931  }
932  spin_unlock_irq(&priv->lock);
933  msleep(1);
934  ipoib_drain_cq(dev);
935  spin_lock_irq(&priv->lock);
936  }
937 
938  spin_unlock_irq(&priv->lock);
939 
940  ipoib_cm_free_rx_reap_list(dev);
941 
942  cancel_delayed_work(&priv->cm.stale_task);
943 }
944 
945 static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
946 {
947  struct ipoib_cm_tx *p = cm_id->context;
948  struct ipoib_dev_priv *priv = netdev_priv(p->dev);
949  struct ipoib_cm_data *data = event->private_data;
950  struct sk_buff_head skqueue;
951  struct ib_qp_attr qp_attr;
952  int qp_attr_mask, ret;
953  struct sk_buff *skb;
954 
955  p->mtu = be32_to_cpu(data->mtu);
956 
957  if (p->mtu <= IPOIB_ENCAP_LEN) {
958  ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
959  p->mtu, IPOIB_ENCAP_LEN);
960  return -EINVAL;
961  }
962 
963  qp_attr.qp_state = IB_QPS_RTR;
964  ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
965  if (ret) {
966  ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
967  return ret;
968  }
969 
970  qp_attr.rq_psn = 0 /* FIXME */;
971  ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
972  if (ret) {
973  ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
974  return ret;
975  }
976 
977  qp_attr.qp_state = IB_QPS_RTS;
978  ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
979  if (ret) {
980  ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
981  return ret;
982  }
983  ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
984  if (ret) {
985  ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
986  return ret;
987  }
988 
989  skb_queue_head_init(&skqueue);
990 
991  spin_lock_irq(&priv->lock);
992  set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
993  if (p->neigh)
994  while ((skb = __skb_dequeue(&p->neigh->queue)))
995  __skb_queue_tail(&skqueue, skb);
996  spin_unlock_irq(&priv->lock);
997 
998  while ((skb = __skb_dequeue(&skqueue))) {
999  skb->dev = p->dev;
1000  if (dev_queue_xmit(skb))
1001  ipoib_warn(priv, "dev_queue_xmit failed "
1002  "to requeue packet\n");
1003  }
1004 
1005  ret = ib_send_cm_rtu(cm_id, NULL, 0);
1006  if (ret) {
1007  ipoib_warn(priv, "failed to send RTU: %d\n", ret);
1008  return ret;
1009  }
1010  return 0;
1011 }
1012 
1013 static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
1014 {
1015  struct ipoib_dev_priv *priv = netdev_priv(dev);
1016  struct ib_qp_init_attr attr = {
1017  .send_cq = priv->recv_cq,
1018  .recv_cq = priv->recv_cq,
1019  .srq = priv->cm.srq,
1020  .cap.max_send_wr = ipoib_sendq_size,
1021  .cap.max_send_sge = 1,
1022  .sq_sig_type = IB_SIGNAL_ALL_WR,
1023  .qp_type = IB_QPT_RC,
1024  .qp_context = tx
1025  };
1026 
1027  return ib_create_qp(priv->pd, &attr);
1028 }
1029 
1030 static int ipoib_cm_send_req(struct net_device *dev,
1031  struct ib_cm_id *id, struct ib_qp *qp,
1032  u32 qpn,
1033  struct ib_sa_path_rec *pathrec)
1034 {
1035  struct ipoib_dev_priv *priv = netdev_priv(dev);
1036  struct ipoib_cm_data data = {};
1037  struct ib_cm_req_param req = {};
1038 
1039  data.qpn = cpu_to_be32(priv->qp->qp_num);
1040  data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
1041 
1042  req.primary_path = pathrec;
1043  req.alternate_path = NULL;
1044  req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
1045  req.qp_num = qp->qp_num;
1046  req.qp_type = qp->qp_type;
1047  req.private_data = &data;
1048  req.private_data_len = sizeof data;
1049  req.flow_control = 0;
1050 
1051  req.starting_psn = 0; /* FIXME */
1052 
1053  /*
1054  * Pick some arbitrary defaults here; we could make these
1055  * module parameters if anyone cared about setting them.
1056  */
1057  req.responder_resources = 4;
1058  req.remote_cm_response_timeout = 20;
1059  req.local_cm_response_timeout = 20;
1060  req.retry_count = 0; /* RFC draft warns against retries */
1061  req.rnr_retry_count = 0; /* RFC draft warns against retries */
1062  req.max_cm_retries = 15;
1063  req.srq = ipoib_cm_has_srq(dev);
1064  return ib_send_cm_req(id, &req);
1065 }
1066 
1067 static int ipoib_cm_modify_tx_init(struct net_device *dev,
1068  struct ib_cm_id *cm_id, struct ib_qp *qp)
1069 {
1070  struct ipoib_dev_priv *priv = netdev_priv(dev);
1071  struct ib_qp_attr qp_attr;
1072  int qp_attr_mask, ret;
1073  ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
1074  if (ret) {
1075  ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
1076  return ret;
1077  }
1078 
1079  qp_attr.qp_state = IB_QPS_INIT;
1080  qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
1081  qp_attr.port_num = priv->port;
1082  qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
1083 
1084  ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
1085  if (ret) {
1086  ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
1087  return ret;
1088  }
1089  return 0;
1090 }
1091 
1092 static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
1093  struct ib_sa_path_rec *pathrec)
1094 {
1095  struct ipoib_dev_priv *priv = netdev_priv(p->dev);
1096  int ret;
1097 
1098  p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring);
1099  if (!p->tx_ring) {
1100  ipoib_warn(priv, "failed to allocate tx ring\n");
1101  ret = -ENOMEM;
1102  goto err_tx;
1103  }
1104 
1105  p->qp = ipoib_cm_create_tx_qp(p->dev, p);
1106  if (IS_ERR(p->qp)) {
1107  ret = PTR_ERR(p->qp);
1108  ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
1109  goto err_qp;
1110  }
1111 
1112  p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
1113  if (IS_ERR(p->id)) {
1114  ret = PTR_ERR(p->id);
1115  ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
1116  goto err_id;
1117  }
1118 
1119  ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
1120  if (ret) {
1121  ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
1122  goto err_modify;
1123  }
1124 
1125  ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
1126  if (ret) {
1127  ipoib_warn(priv, "failed to send cm req: %d\n", ret);
1128  goto err_send_cm;
1129  }
1130 
1131  ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
1132  p->qp->qp_num, pathrec->dgid.raw, qpn);
1133 
1134  return 0;
1135 
1136 err_send_cm:
1137 err_modify:
1138  ib_destroy_cm_id(p->id);
1139 err_id:
1140  p->id = NULL;
1141  ib_destroy_qp(p->qp);
1142 err_qp:
1143  p->qp = NULL;
1144  vfree(p->tx_ring);
1145 err_tx:
1146  return ret;
1147 }
1148 
1149 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
1150 {
1151  struct ipoib_dev_priv *priv = netdev_priv(p->dev);
1152  struct ipoib_cm_tx_buf *tx_req;
1153  unsigned long begin;
1154 
1155  ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
1156  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
1157 
1158  if (p->id)
1159  ib_destroy_cm_id(p->id);
1160 
1161  if (p->tx_ring) {
1162  /* Wait for all sends to complete */
1163  begin = jiffies;
1164  while ((int) p->tx_tail - (int) p->tx_head < 0) {
1165  if (time_after(jiffies, begin + 5 * HZ)) {
1166  ipoib_warn(priv, "timing out; %d sends not completed\n",
1167  p->tx_head - p->tx_tail);
1168  goto timeout;
1169  }
1170 
1171  msleep(1);
1172  }
1173  }
1174 
1175 timeout:
1176 
1177  while ((int) p->tx_tail - (int) p->tx_head < 0) {
1178  tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
1179  ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
1180  DMA_TO_DEVICE);
1181  dev_kfree_skb_any(tx_req->skb);
1182  ++p->tx_tail;
1183  netif_tx_lock_bh(p->dev);
1184  if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
1185  netif_queue_stopped(p->dev) &&
1186  test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
1187  netif_wake_queue(p->dev);
1188  netif_tx_unlock_bh(p->dev);
1189  }
1190 
1191  if (p->qp)
1192  ib_destroy_qp(p->qp);
1193 
1194  vfree(p->tx_ring);
1195  kfree(p);
1196 }
1197 
1198 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
1199  struct ib_cm_event *event)
1200 {
1201  struct ipoib_cm_tx *tx = cm_id->context;
1202  struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
1203  struct net_device *dev = priv->dev;
1204  struct ipoib_neigh *neigh;
1205  unsigned long flags;
1206  int ret;
1207 
1208  switch (event->event) {
1209  case IB_CM_DREQ_RECEIVED:
1210  ipoib_dbg(priv, "DREQ received.\n");
1211  ib_send_cm_drep(cm_id, NULL, 0);
1212  break;
1213  case IB_CM_REP_RECEIVED:
1214  ipoib_dbg(priv, "REP received.\n");
1215  ret = ipoib_cm_rep_handler(cm_id, event);
1216  if (ret)
1217  ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
1218  NULL, 0, NULL, 0);
1219  break;
1220  case IB_CM_REQ_ERROR:
1221  case IB_CM_REJ_RECEIVED:
1222  case IB_CM_TIMEWAIT_EXIT:
1223  ipoib_dbg(priv, "CM error %d.\n", event->event);
1224  netif_tx_lock_bh(dev);
1225  spin_lock_irqsave(&priv->lock, flags);
1226  neigh = tx->neigh;
1227 
1228  if (neigh) {
1229  neigh->cm = NULL;
1230  list_del(&neigh->list);
1231  ipoib_neigh_free(neigh);
1232 
1233  tx->neigh = NULL;
1234  }
1235 
1236  if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1237  list_move(&tx->list, &priv->cm.reap_list);
1238  queue_work(ipoib_workqueue, &priv->cm.reap_task);
1239  }
1240 
1241  spin_unlock_irqrestore(&priv->lock, flags);
1242  netif_tx_unlock_bh(dev);
1243  break;
1244  default:
1245  break;
1246  }
1247 
1248  return 0;
1249 }
1250 
1251 struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
1252  struct ipoib_neigh *neigh)
1253 {
1254  struct ipoib_dev_priv *priv = netdev_priv(dev);
1255  struct ipoib_cm_tx *tx;
1256 
1257  tx = kzalloc(sizeof *tx, GFP_ATOMIC);
1258  if (!tx)
1259  return NULL;
1260 
1261  neigh->cm = tx;
1262  tx->neigh = neigh;
1263  tx->path = path;
1264  tx->dev = dev;
1265  list_add(&tx->list, &priv->cm.start_list);
1266  set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
1267  queue_work(ipoib_workqueue, &priv->cm.start_task);
1268  return tx;
1269 }
1270 
1271 void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
1272 {
1273  struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
1274  unsigned long flags;
1275  if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1276  spin_lock_irqsave(&priv->lock, flags);
1277  list_move(&tx->list, &priv->cm.reap_list);
1278  queue_work(ipoib_workqueue, &priv->cm.reap_task);
1279  ipoib_dbg(priv, "Reap connection for gid %pI6\n",
1280  tx->neigh->daddr + 4);
1281  tx->neigh = NULL;
1282  spin_unlock_irqrestore(&priv->lock, flags);
1283  }
1284 }
1285 
1286 static void ipoib_cm_tx_start(struct work_struct *work)
1287 {
1288  struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1289  cm.start_task);
1290  struct net_device *dev = priv->dev;
1291  struct ipoib_neigh *neigh;
1292  struct ipoib_cm_tx *p;
1293  unsigned long flags;
1294  int ret;
1295 
1296  struct ib_sa_path_rec pathrec;
1297  u32 qpn;
1298 
1299  netif_tx_lock_bh(dev);
1300  spin_lock_irqsave(&priv->lock, flags);
1301 
1302  while (!list_empty(&priv->cm.start_list)) {
1303  p = list_entry(priv->cm.start_list.next, typeof(*p), list);
1304  list_del_init(&p->list);
1305  neigh = p->neigh;
1306  qpn = IPOIB_QPN(neigh->daddr);
1307  memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
1308 
1309  spin_unlock_irqrestore(&priv->lock, flags);
1310  netif_tx_unlock_bh(dev);
1311 
1312  ret = ipoib_cm_tx_init(p, qpn, &pathrec);
1313 
1314  netif_tx_lock_bh(dev);
1315  spin_lock_irqsave(&priv->lock, flags);
1316 
1317  if (ret) {
1318  neigh = p->neigh;
1319  if (neigh) {
1320  neigh->cm = NULL;
1321  list_del(&neigh->list);
1322  ipoib_neigh_free(neigh);
1323  }
1324  list_del(&p->list);
1325  kfree(p);
1326  }
1327  }
1328 
1329  spin_unlock_irqrestore(&priv->lock, flags);
1330  netif_tx_unlock_bh(dev);
1331 }
1332 
1333 static void ipoib_cm_tx_reap(struct work_struct *work)
1334 {
1335  struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1336  cm.reap_task);
1337  struct net_device *dev = priv->dev;
1338  struct ipoib_cm_tx *p;
1339  unsigned long flags;
1340 
1341  netif_tx_lock_bh(dev);
1342  spin_lock_irqsave(&priv->lock, flags);
1343 
1344  while (!list_empty(&priv->cm.reap_list)) {
1345  p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
1346  list_del(&p->list);
1347  spin_unlock_irqrestore(&priv->lock, flags);
1348  netif_tx_unlock_bh(dev);
1349  ipoib_cm_tx_destroy(p);
1350  netif_tx_lock_bh(dev);
1351  spin_lock_irqsave(&priv->lock, flags);
1352  }
1353 
1354  spin_unlock_irqrestore(&priv->lock, flags);
1355  netif_tx_unlock_bh(dev);
1356 }
1357 
1358 static void ipoib_cm_skb_reap(struct work_struct *work)
1359 {
1360  struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1361  cm.skb_task);
1362  struct net_device *dev = priv->dev;
1363  struct sk_buff *skb;
1364  unsigned long flags;
1365  unsigned mtu = priv->mcast_mtu;
1366 
1367  netif_tx_lock_bh(dev);
1368  spin_lock_irqsave(&priv->lock, flags);
1369 
1370  while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
1371  spin_unlock_irqrestore(&priv->lock, flags);
1372  netif_tx_unlock_bh(dev);
1373 
1374  if (skb->protocol == htons(ETH_P_IP))
1375  icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1376 #if IS_ENABLED(CONFIG_IPV6)
1377  else if (skb->protocol == htons(ETH_P_IPV6))
1378  icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1379 #endif
1380  dev_kfree_skb_any(skb);
1381 
1382  netif_tx_lock_bh(dev);
1383  spin_lock_irqsave(&priv->lock, flags);
1384  }
1385 
1386  spin_unlock_irqrestore(&priv->lock, flags);
1387  netif_tx_unlock_bh(dev);
1388 }
1389 
1390 void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
1391  unsigned int mtu)
1392 {
1393  struct ipoib_dev_priv *priv = netdev_priv(dev);
1394  int e = skb_queue_empty(&priv->cm.skb_queue);
1395 
1396  if (skb_dst(skb))
1397  skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
1398 
1399  skb_queue_tail(&priv->cm.skb_queue, skb);
1400  if (e)
1401  queue_work(ipoib_workqueue, &priv->cm.skb_task);
1402 }
1403 
1404 static void ipoib_cm_rx_reap(struct work_struct *work)
1405 {
1406  ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
1407  cm.rx_reap_task)->dev);
1408 }
1409 
1410 static void ipoib_cm_stale_task(struct work_struct *work)
1411 {
1412  struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
1413  cm.stale_task.work);
1414  struct ipoib_cm_rx *p;
1415  int ret;
1416 
1417  spin_lock_irq(&priv->lock);
1418  while (!list_empty(&priv->cm.passive_ids)) {
1419  /* List is sorted by LRU, start from tail,
1420  * stop when we see a recently used entry */
1421  p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
1422  if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
1423  break;
1424  list_move(&p->list, &priv->cm.rx_error_list);
1425  p->state = IPOIB_CM_RX_ERROR;
1426  spin_unlock_irq(&priv->lock);
1427  ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
1428  if (ret)
1429  ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
1430  spin_lock_irq(&priv->lock);
1431  }
1432 
1433  if (!list_empty(&priv->cm.passive_ids))
1434  queue_delayed_work(ipoib_workqueue,
1435  &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
1436  spin_unlock_irq(&priv->lock);
1437 }
1438 
1439 
1440 static ssize_t show_mode(struct device *d, struct device_attribute *attr,
1441  char *buf)
1442 {
1443  struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
1444 
1445  if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
1446  return sprintf(buf, "connected\n");
1447  else
1448  return sprintf(buf, "datagram\n");
1449 }
1450 
1451 static ssize_t set_mode(struct device *d, struct device_attribute *attr,
1452  const char *buf, size_t count)
1453 {
1454  struct net_device *dev = to_net_dev(d);
1455  int ret;
1456 
1457  if (!rtnl_trylock())
1458  return restart_syscall();
1459 
1460  ret = ipoib_set_mode(dev, buf);
1461 
1462  rtnl_unlock();
1463 
1464  if (!ret)
1465  return count;
1466 
1467  return ret;
1468 }
1469 
1470 static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);
1471 
1472 int ipoib_cm_add_mode_attr(struct net_device *dev)
1473 {
1474  return device_create_file(&dev->dev, &dev_attr_mode);
1475 }
1476 
1477 static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
1478 {
1479  struct ipoib_dev_priv *priv = netdev_priv(dev);
1480  struct ib_srq_init_attr srq_init_attr = {
1481  .srq_type = IB_SRQT_BASIC,
1482  .attr = {
1483  .max_wr = ipoib_recvq_size,
1484  .max_sge = max_sge
1485  }
1486  };
1487 
1488  priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
1489  if (IS_ERR(priv->cm.srq)) {
1490  if (PTR_ERR(priv->cm.srq) != -ENOSYS)
1491  printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
1492  priv->ca->name, PTR_ERR(priv->cm.srq));
1493  priv->cm.srq = NULL;
1494  return;
1495  }
1496 
1497  priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring);
1498  if (!priv->cm.srq_ring) {
1499  printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
1500  priv->ca->name, ipoib_recvq_size);
1501  ib_destroy_srq(priv->cm.srq);
1502  priv->cm.srq = NULL;
1503  return;
1504  }
1505 
1506 }
1507 
1508 int ipoib_cm_dev_init(struct net_device *dev)
1509 {
1510  struct ipoib_dev_priv *priv = netdev_priv(dev);
1511  int i, ret;
1512  struct ib_device_attr attr;
1513 
1514  INIT_LIST_HEAD(&priv->cm.passive_ids);
1515  INIT_LIST_HEAD(&priv->cm.reap_list);
1516  INIT_LIST_HEAD(&priv->cm.start_list);
1517  INIT_LIST_HEAD(&priv->cm.rx_error_list);
1518  INIT_LIST_HEAD(&priv->cm.rx_flush_list);
1519  INIT_LIST_HEAD(&priv->cm.rx_drain_list);
1520  INIT_LIST_HEAD(&priv->cm.rx_reap_list);
1521  INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
1522  INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
1523  INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
1524  INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
1525  INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
1526 
1527  skb_queue_head_init(&priv->cm.skb_queue);
1528 
1529  ret = ib_query_device(priv->ca, &attr);
1530  if (ret) {
1531  printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
1532  return ret;
1533  }
1534 
1535  ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
1536 
1537  attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
1538  ipoib_cm_create_srq(dev, attr.max_srq_sge);
1539  if (ipoib_cm_has_srq(dev)) {
1540  priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
1541  priv->cm.num_frags = attr.max_srq_sge;
1542  ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
1543  priv->cm.max_cm_mtu, priv->cm.num_frags);
1544  } else {
1545  priv->cm.max_cm_mtu = IPOIB_CM_MTU;
1546  priv->cm.num_frags = IPOIB_CM_RX_SG;
1547  }
1548 
1549  ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
1550 
1551  if (ipoib_cm_has_srq(dev)) {
1552  for (i = 0; i < ipoib_recvq_size; ++i) {
1553  if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
1554  priv->cm.num_frags - 1,
1555  priv->cm.srq_ring[i].mapping)) {
1556  ipoib_warn(priv, "failed to allocate "
1557  "receive buffer %d\n", i);
1558  ipoib_cm_dev_cleanup(dev);
1559  return -ENOMEM;
1560  }
1561 
1562  if (ipoib_cm_post_receive_srq(dev, i)) {
1563  ipoib_warn(priv, "ipoib_cm_post_receive_srq "
1564  "failed for buf %d\n", i);
1565  ipoib_cm_dev_cleanup(dev);
1566  return -EIO;
1567  }
1568  }
1569  }
1570 
1571  priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
1572  return 0;
1573 }
1574 
1575 void ipoib_cm_dev_cleanup(struct net_device *dev)
1576 {
1577  struct ipoib_dev_priv *priv = netdev_priv(dev);
1578  int ret;
1579 
1580  if (!priv->cm.srq)
1581  return;
1582 
1583  ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
1584 
1585  ret = ib_destroy_srq(priv->cm.srq);
1586  if (ret)
1587  ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
1588 
1589  priv->cm.srq = NULL;
1590  if (!priv->cm.srq_ring)
1591  return;
1592 
1593  ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
1594  priv->cm.srq_ring = NULL;
1595 }