Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
inet_lro.c
Go to the documentation of this file.
1 /*
2  * linux/net/ipv4/inet_lro.c
3  *
4  * Large Receive Offload (ipv4 / tcp)
5  *
6  * (C) Copyright IBM Corp. 2007
7  *
8  * Authors:
9  * Jan-Bernd Themann <[email protected]>
10  * Christoph Raisch <[email protected]>
11  *
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2, or (at your option)
16  * any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program; if not, write to the Free Software
25  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26  */
27 
28 
29 #include <linux/module.h>
30 #include <linux/if_vlan.h>
31 #include <linux/inet_lro.h>
32 
33 MODULE_LICENSE("GPL");
34 MODULE_AUTHOR("Jan-Bernd Themann <[email protected]>");
35 MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
36 
/*
 * Header length helpers.  The doff/ihl header fields count 32-bit words,
 * so shifting left by two yields the length in bytes.  Arguments are
 * parenthesised so that any pointer expression may be passed.
 */
#define TCP_HDR_LEN(tcph) ((tcph)->doff << 2)
#define IP_HDR_LEN(iph) ((iph)->ihl << 2)
#define TCP_PAYLOAD_LENGTH(iph, tcph) \
	(ntohs((iph)->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))

/* Header lengths, in 32-bit words, accepted by lro_tcp_ip_check() */
#define IPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_W_TIMESTAMP 8

/* Default header length used for segments that bypass aggregation */
#define LRO_MAX_PG_HLEN 64

/*
 * Bump one counter in lro_mgr->stats.  Wrapped in do { } while (0) so the
 * macro behaves as a single statement, including in if/else bodies.
 */
#define LRO_INC_STATS(lro_mgr, attr) \
	do { (lro_mgr)->stats.attr++; } while (0)
49 
50 /*
51  * Basic tcp checks whether packet is suitable for LRO
52  */
53 
54 static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
55  int len, const struct net_lro_desc *lro_desc)
56 {
57  /* check ip header: don't aggregate padded frames */
58  if (ntohs(iph->tot_len) != len)
59  return -1;
60 
61  if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
62  return -1;
63 
64  if (iph->ihl != IPH_LEN_WO_OPTIONS)
65  return -1;
66 
67  if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
68  tcph->rst || tcph->syn || tcph->fin)
69  return -1;
70 
71  if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
72  return -1;
73 
74  if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
75  tcph->doff != TCPH_LEN_W_TIMESTAMP)
76  return -1;
77 
78  /* check tcp options (only timestamp allowed) */
79  if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
80  __be32 *topt = (__be32 *)(tcph + 1);
81 
82  if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
83  | (TCPOPT_TIMESTAMP << 8)
85  return -1;
86 
87  /* timestamp should be in right order */
88  topt++;
89  if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
90  ntohl(*topt)))
91  return -1;
92 
93  /* timestamp reply should not be zero */
94  topt++;
95  if (*topt == 0)
96  return -1;
97  }
98 
99  return 0;
100 }
101 
102 static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
103 {
104  struct iphdr *iph = lro_desc->iph;
105  struct tcphdr *tcph = lro_desc->tcph;
106  __be32 *p;
107  __wsum tcp_hdr_csum;
108 
109  tcph->ack_seq = lro_desc->tcp_ack;
110  tcph->window = lro_desc->tcp_window;
111 
112  if (lro_desc->tcp_saw_tstamp) {
113  p = (__be32 *)(tcph + 1);
114  *(p+2) = lro_desc->tcp_rcv_tsecr;
115  }
116 
117  iph->tot_len = htons(lro_desc->ip_tot_len);
118 
119  iph->check = 0;
120  iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
121 
122  tcph->check = 0;
123  tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
124  lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
125  tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
126  lro_desc->ip_tot_len -
127  IP_HDR_LEN(iph), IPPROTO_TCP,
128  lro_desc->data_csum);
129 }
130 
131 static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
132 {
133  __wsum tcp_csum;
134  __wsum tcp_hdr_csum;
135  __wsum tcp_ps_hdr_csum;
136 
137  tcp_csum = ~csum_unfold(tcph->check);
138  tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
139 
140  tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
141  len + TCP_HDR_LEN(tcph),
142  IPPROTO_TCP, 0);
143 
144  return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
145  tcp_ps_hdr_csum);
146 }
147 
148 static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
149  struct iphdr *iph, struct tcphdr *tcph)
150 {
151  int nr_frags;
152  __be32 *ptr;
153  u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
154 
155  nr_frags = skb_shinfo(skb)->nr_frags;
156  lro_desc->parent = skb;
157  lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
158  lro_desc->iph = iph;
159  lro_desc->tcph = tcph;
160  lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
161  lro_desc->tcp_ack = tcph->ack_seq;
162  lro_desc->tcp_window = tcph->window;
163 
164  lro_desc->pkt_aggr_cnt = 1;
165  lro_desc->ip_tot_len = ntohs(iph->tot_len);
166 
167  if (tcph->doff == 8) {
168  ptr = (__be32 *)(tcph+1);
169  lro_desc->tcp_saw_tstamp = 1;
170  lro_desc->tcp_rcv_tsval = *(ptr+1);
171  lro_desc->tcp_rcv_tsecr = *(ptr+2);
172  }
173 
174  lro_desc->mss = tcp_data_len;
175  lro_desc->active = 1;
176 
177  lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
178  tcp_data_len);
179 }
180 
181 static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
182 {
183  memset(lro_desc, 0, sizeof(struct net_lro_desc));
184 }
185 
186 static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
187  struct tcphdr *tcph, int tcp_data_len)
188 {
189  struct sk_buff *parent = lro_desc->parent;
190  __be32 *topt;
191 
192  lro_desc->pkt_aggr_cnt++;
193  lro_desc->ip_tot_len += tcp_data_len;
194  lro_desc->tcp_next_seq += tcp_data_len;
195  lro_desc->tcp_window = tcph->window;
196  lro_desc->tcp_ack = tcph->ack_seq;
197 
198  /* don't update tcp_rcv_tsval, would not work with PAWS */
199  if (lro_desc->tcp_saw_tstamp) {
200  topt = (__be32 *) (tcph + 1);
201  lro_desc->tcp_rcv_tsecr = *(topt + 2);
202  }
203 
204  lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
205  lro_tcp_data_csum(iph, tcph,
206  tcp_data_len),
207  parent->len);
208 
209  parent->len += tcp_data_len;
210  parent->data_len += tcp_data_len;
211  if (tcp_data_len > lro_desc->mss)
212  lro_desc->mss = tcp_data_len;
213 }
214 
215 static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
216  struct iphdr *iph, struct tcphdr *tcph)
217 {
218  struct sk_buff *parent = lro_desc->parent;
219  int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
220 
221  lro_add_common(lro_desc, iph, tcph, tcp_data_len);
222 
223  skb_pull(skb, (skb->len - tcp_data_len));
224  parent->truesize += skb->truesize;
225 
226  if (lro_desc->last_skb)
227  lro_desc->last_skb->next = skb;
228  else
229  skb_shinfo(parent)->frag_list = skb;
230 
231  lro_desc->last_skb = skb;
232 }
233 
234 static void lro_add_frags(struct net_lro_desc *lro_desc,
235  int len, int hlen, int truesize,
236  struct skb_frag_struct *skb_frags,
237  struct iphdr *iph, struct tcphdr *tcph)
238 {
239  struct sk_buff *skb = lro_desc->parent;
240  int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
241 
242  lro_add_common(lro_desc, iph, tcph, tcp_data_len);
243 
244  skb->truesize += truesize;
245 
246  skb_frags[0].page_offset += hlen;
247  skb_frag_size_sub(&skb_frags[0], hlen);
248 
249  while (tcp_data_len > 0) {
250  *(lro_desc->next_frag) = *skb_frags;
251  tcp_data_len -= skb_frag_size(skb_frags);
252  lro_desc->next_frag++;
253  skb_frags++;
254  skb_shinfo(skb)->nr_frags++;
255  }
256 }
257 
258 static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
259  struct iphdr *iph,
260  struct tcphdr *tcph)
261 {
262  if ((lro_desc->iph->saddr != iph->saddr) ||
263  (lro_desc->iph->daddr != iph->daddr) ||
264  (lro_desc->tcph->source != tcph->source) ||
265  (lro_desc->tcph->dest != tcph->dest))
266  return -1;
267  return 0;
268 }
269 
270 static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
271  struct net_lro_desc *lro_arr,
272  struct iphdr *iph,
273  struct tcphdr *tcph)
274 {
275  struct net_lro_desc *lro_desc = NULL;
276  struct net_lro_desc *tmp;
277  int max_desc = lro_mgr->max_desc;
278  int i;
279 
280  for (i = 0; i < max_desc; i++) {
281  tmp = &lro_arr[i];
282  if (tmp->active)
283  if (!lro_check_tcp_conn(tmp, iph, tcph)) {
284  lro_desc = tmp;
285  goto out;
286  }
287  }
288 
289  for (i = 0; i < max_desc; i++) {
290  if (!lro_arr[i].active) {
291  lro_desc = &lro_arr[i];
292  goto out;
293  }
294  }
295 
296  LRO_INC_STATS(lro_mgr, no_desc);
297 out:
298  return lro_desc;
299 }
300 
301 static void lro_flush(struct net_lro_mgr *lro_mgr,
302  struct net_lro_desc *lro_desc)
303 {
304  if (lro_desc->pkt_aggr_cnt > 1)
305  lro_update_tcp_ip_header(lro_desc);
306 
307  skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
308 
309  if (lro_mgr->features & LRO_F_NAPI)
310  netif_receive_skb(lro_desc->parent);
311  else
312  netif_rx(lro_desc->parent);
313 
314  LRO_INC_STATS(lro_mgr, flushed);
315  lro_clear_desc(lro_desc);
316 }
317 
318 static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
319  void *priv)
320 {
321  struct net_lro_desc *lro_desc;
322  struct iphdr *iph;
323  struct tcphdr *tcph;
324  u64 flags;
325  int vlan_hdr_len = 0;
326 
327  if (!lro_mgr->get_skb_header ||
328  lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
329  &flags, priv))
330  goto out;
331 
332  if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
333  goto out;
334 
335  lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
336  if (!lro_desc)
337  goto out;
338 
339  if ((skb->protocol == htons(ETH_P_8021Q)) &&
340  !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
341  vlan_hdr_len = VLAN_HLEN;
342 
343  if (!lro_desc->active) { /* start new lro session */
344  if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
345  goto out;
346 
347  skb->ip_summed = lro_mgr->ip_summed_aggr;
348  lro_init_desc(lro_desc, skb, iph, tcph);
349  LRO_INC_STATS(lro_mgr, aggregated);
350  return 0;
351  }
352 
353  if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
354  goto out2;
355 
356  if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
357  goto out2;
358 
359  lro_add_packet(lro_desc, skb, iph, tcph);
360  LRO_INC_STATS(lro_mgr, aggregated);
361 
362  if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
363  lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
364  lro_flush(lro_mgr, lro_desc);
365 
366  return 0;
367 
368 out2: /* send aggregated SKBs to stack */
369  lro_flush(lro_mgr, lro_desc);
370 
371 out:
372  return 1;
373 }
374 
375 
376 static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
377  struct skb_frag_struct *frags,
378  int len, int true_size,
379  void *mac_hdr,
380  int hlen, __wsum sum,
381  u32 ip_summed)
382 {
383  struct sk_buff *skb;
384  struct skb_frag_struct *skb_frags;
385  int data_len = len;
386  int hdr_len = min(len, hlen);
387 
388  skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad);
389  if (!skb)
390  return NULL;
391 
392  skb_reserve(skb, lro_mgr->frag_align_pad);
393  skb->len = len;
394  skb->data_len = len - hdr_len;
395  skb->truesize += true_size;
396  skb->tail += hdr_len;
397 
398  memcpy(skb->data, mac_hdr, hdr_len);
399 
400  skb_frags = skb_shinfo(skb)->frags;
401  while (data_len > 0) {
402  *skb_frags = *frags;
403  data_len -= skb_frag_size(frags);
404  skb_frags++;
405  frags++;
406  skb_shinfo(skb)->nr_frags++;
407  }
408 
409  skb_shinfo(skb)->frags[0].page_offset += hdr_len;
410  skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len);
411 
412  skb->ip_summed = ip_summed;
413  skb->csum = sum;
414  skb->protocol = eth_type_trans(skb, lro_mgr->dev);
415  return skb;
416 }
417 
418 static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
419  struct skb_frag_struct *frags,
420  int len, int true_size,
421  void *priv, __wsum sum)
422 {
423  struct net_lro_desc *lro_desc;
424  struct iphdr *iph;
425  struct tcphdr *tcph;
426  struct sk_buff *skb;
427  u64 flags;
428  void *mac_hdr;
429  int mac_hdr_len;
430  int hdr_len = LRO_MAX_PG_HLEN;
431  int vlan_hdr_len = 0;
432 
433  if (!lro_mgr->get_frag_header ||
434  lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
435  (void *)&tcph, &flags, priv)) {
436  mac_hdr = skb_frag_address(frags);
437  goto out1;
438  }
439 
440  if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
441  goto out1;
442 
443  hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
444  mac_hdr_len = (int)((void *)(iph) - mac_hdr);
445 
446  lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
447  if (!lro_desc)
448  goto out1;
449 
450  if (!lro_desc->active) { /* start new lro session */
451  if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
452  goto out1;
453 
454  skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
455  hdr_len, 0, lro_mgr->ip_summed_aggr);
456  if (!skb)
457  goto out;
458 
459  if ((skb->protocol == htons(ETH_P_8021Q)) &&
460  !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
461  vlan_hdr_len = VLAN_HLEN;
462 
463  iph = (void *)(skb->data + vlan_hdr_len);
464  tcph = (void *)((u8 *)skb->data + vlan_hdr_len
465  + IP_HDR_LEN(iph));
466 
467  lro_init_desc(lro_desc, skb, iph, tcph);
468  LRO_INC_STATS(lro_mgr, aggregated);
469  return NULL;
470  }
471 
472  if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
473  goto out2;
474 
475  if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
476  goto out2;
477 
478  lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
479  LRO_INC_STATS(lro_mgr, aggregated);
480 
481  if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
482  lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
483  lro_flush(lro_mgr, lro_desc);
484 
485  return NULL;
486 
487 out2: /* send aggregated packets to the stack */
488  lro_flush(lro_mgr, lro_desc);
489 
490 out1: /* Original packet has to be posted to the stack */
491  skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
492  hdr_len, sum, lro_mgr->ip_summed);
493 out:
494  return skb;
495 }
496 
497 void lro_receive_skb(struct net_lro_mgr *lro_mgr,
498  struct sk_buff *skb,
499  void *priv)
500 {
501  if (__lro_proc_skb(lro_mgr, skb, priv)) {
502  if (lro_mgr->features & LRO_F_NAPI)
503  netif_receive_skb(skb);
504  else
505  netif_rx(skb);
506  }
507 }
509 
510 void lro_receive_frags(struct net_lro_mgr *lro_mgr,
511  struct skb_frag_struct *frags,
512  int len, int true_size, void *priv, __wsum sum)
513 {
514  struct sk_buff *skb;
515 
516  skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum);
517  if (!skb)
518  return;
519 
520  if (lro_mgr->features & LRO_F_NAPI)
521  netif_receive_skb(skb);
522  else
523  netif_rx(skb);
524 }
526 
527 void lro_flush_all(struct net_lro_mgr *lro_mgr)
528 {
529  int i;
530  struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
531 
532  for (i = 0; i < lro_mgr->max_desc; i++) {
533  if (lro_desc[i].active)
534  lro_flush(lro_mgr, &lro_desc[i]);
535  }
536 }
538 
539 void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
540  struct iphdr *iph, struct tcphdr *tcph)
541 {
542  struct net_lro_desc *lro_desc;
543 
544  lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
545  if (lro_desc->active)
546  lro_flush(lro_mgr, lro_desc);
547 }