Linux Kernel 3.7.1
ip_gre.c
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov ([email protected])
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation,
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (sort of local ttl),
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
   (see the sketch after this comment).

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)), all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse the packet trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated who injected
   the fatal route to the network, even if it were you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, ipip and gre are naturally modular.
   We could extract the common parts (hash table, ioctl etc)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
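
/*
 * A minimal sketch of the recursion guard described in (1) above.  This is
 * illustrative only: the counter actually lives in net/core/dev.c around the
 * device transmit call, not in this file; the names below follow the 3.7
 * implementation there but are quoted here as an assumption, not as this
 * file's API:
 *
 *	static DEFINE_PER_CPU(int, xmit_recursion);
 *	#define RECURSION_LIMIT 10
 *
 *	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
 *		goto recursion_alert;		// dead loop: drop the skb
 *	__this_cpu_inc(xmit_recursion);
 *	rc = ops->ndo_start_xmit(skb, dev);	// may re-enter via a tunnel
 *	__this_cpu_dec(xmit_recursion);
 */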

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE 16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
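
/*
 * Worked example (illustrative; a plain host-order value is used for
 * simplicity): HASH(0x12345678) folds the two low nibbles,
 * (0x8 ^ 0x7) & 0xF = 0xF, so at most 16 buckets are ever used per table,
 * matching HASH_SIZE above.
 */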
162 
163 #define tunnels_r_l tunnels[3]
164 #define tunnels_r tunnels[2]
165 #define tunnels_l tunnels[1]
166 #define tunnels_wc tunnels[0]
167 /*
168  * Locking : hash tables are protected by RCU and RTNL
169  */
170 
171 #define for_each_ip_tunnel_rcu(start) \
172  for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
173 
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;
	u64	rx_bytes;
	u64	tx_packets;
	u64	tx_bytes;
	struct u64_stats_sync	syncp;
};

static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
						   struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
	tot->rx_errors = dev->stats.rx_errors;

	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
}

/* Does key in tunnel parameters match packet */
static bool ipgre_key_match(const struct ip_tunnel_parm *p,
			    __be16 flags, __be32 key)
{
	if (p->i_flags & GRE_KEY) {
		if (flags & GRE_KEY)
			return key == p->i_key;
		else
			return false;	/* key expected, none present */
	} else
		return !(flags & GRE_KEY);
}

/* Given src, dst and key, find appropriate for input tunnel. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be16 flags, __be32 key,
					     __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
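
/*
 * Scoring example (illustrative): a tunnel that matches on addresses and
 * key but was configured on a different link and with a different device
 * type scores 1 | 2 = 3.  An exact match (score 0) returns immediately;
 * otherwise the lowest-scoring candidate wins, and if nothing matched in
 * any of the four tables, an UP fallback device (gre0) catches the packet.
 */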

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
					       struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
						    struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
					     struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
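
/*
 * The locate-or-create path above is what the classic ioctl-based tooling
 * exercises, e.g. (illustrative addresses):
 *
 *	ip tunnel add gre1 mode gre remote 192.0.2.1 local 198.51.100.1 ttl 64
 */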

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key to the third word
   in the GRE header. It makes it impossible to maintain even soft state
   for keyed GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by a Cisco employee,
   so what the hell do these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	      *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
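
/*
 * INET_ECN_encapsulate() maps the inner ECN field into the outer header,
 * roughly the RFC 6040 "normal mode" behaviour (illustrative summary):
 * inner Not-ECT stays Not-ECT outside, inner ECT(0)/ECT(1) are copied,
 * and inner CE is encapsulated as outer ECT(0), so a tunnel ingress never
 * mints a fresh congestion mark in the outer header.
 */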

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16 flags;
	__sum16 csum = 0;
	__be32 key = 0;
	u32 seqno = 0;
	struct ip_tunnel *tunnel;
	int offset = 4;
	__be16 gre_proto;
	int err;

	if (!pskb_may_pull(skb, 16))
		goto drop;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto drop;
			}
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		gro_cells_receive(&tunnel->gro_cells, skb);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}
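
/*
 * For reference, the header parsed above (RFC 2784 base header with the
 * RFC 1701/2890 C/R/K/S flag bits this driver checks); each optional
 * 32-bit word is present only when the corresponding flag is set:
 *
 *	|C|R|K|S| Reserved  | Ver |       Protocol Type       |
 *	|   Checksum (opt.)       |     Reserved (opt.)       |
 *	|                      Key (opt.)                     |
 *	|               Sequence Number (opt.)                |
 */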

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr *old_iph = ip_hdr(skb);
	const struct iphdr *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    skb_checksum_help(skb))
		goto tx_error;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, old_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = fl4.daddr;
	iph->saddr = fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
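
/*
 * Option word layout written above (illustrative): ptr starts at the last
 * 32-bit word of the GRE header and walks backwards, so with all three
 * o_flags set the header on the wire reads
 *
 *	flags/proto (4) | csum+reserved (4) | key (4) | seq (4)
 *
 * i.e. the sequence number is filled first (last word), then the key,
 * then the checksum over everything that follows the IP header.
 */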

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
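
/*
 * Worked example (illustrative): over a 1500-byte Ethernet underlay with a
 * keyed, checksummed GRE tunnel, addend = 20 (iphdr) + 4 (base GRE) +
 * 4 (csum) + 4 (key) = 32, so a plain ARPHRD_IPGRE device (hard_header_len
 * 0) ends up with mtu = 1500 - 32 = 1468.
 */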

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows constructing a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so that I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int err;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler	= ipgre_rcv,
	.err_handler	= ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
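
/*
 * These IFLA_GRE_* attributes are what the rtnetlink front end fills in,
 * e.g. (illustrative addresses and key):
 *
 *	ip link add gre1 type gre remote 192.0.2.1 local 198.51.100.1 \
 *		ttl 64 key 42
 */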

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
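
/*
 * The "gretap" variant carries Ethernet frames (ETH_P_TEB) instead of bare
 * IP packets, so the resulting device can be enslaved to a bridge, e.g.
 * (illustrative):
 *
 *	ip link add gretap1 type gretap remote 192.0.2.1 local 198.51.100.1
 *	brctl addif br0 gretap1
 */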

/*
 *	And now the modules code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");