Linux Kernel 3.7.1
route.c
1 /*
2  * INET An implementation of the TCP/IP protocol suite for the LINUX
3  * operating system. INET is implemented using the BSD Socket
4  * interface as the means of communication with the user level.
5  *
6  * ROUTE - implementation of the IP router.
7  *
8  * Authors: Ross Biro
9  * Fred N. van Kempen, <[email protected]>
10  * Alan Cox, <[email protected]>
11  * Linus Torvalds, <[email protected]>
12  * Alexey Kuznetsov, <[email protected]>
13  *
14  * Fixes:
15  * Alan Cox : Verify area fixes.
16  * Alan Cox : cli() protects routing changes
17  * Rui Oliveira : ICMP routing table updates
18  * ([email protected]) Routing table insertion and update
19  * Linus Torvalds : Rewrote bits to be sensible
20  * Alan Cox : Added BSD route gw semantics
21  * Alan Cox : Super /proc >4K
22  * Alan Cox : MTU in route table
23  * Alan Cox : MSS actually. Also added the window
24  * clamper.
25  * Sam Lantinga : Fixed route matching in rt_del()
26  * Alan Cox : Routing cache support.
27  * Alan Cox : Removed compatibility cruft.
28  * Alan Cox : RTF_REJECT support.
29  * Alan Cox : TCP irtt support.
30  * Jonathan Naylor : Added Metric support.
31  * Miquel van Smoorenburg : BSD API fixes.
32  * Miquel van Smoorenburg : Metrics.
33  * Alan Cox : Use __u32 properly
34  * Alan Cox : Aligned routing errors more closely with BSD
35  * our system is still very different.
36  * Alan Cox : Faster /proc handling
37  * Alexey Kuznetsov : Massive rework to support tree based routing,
38  * routing caches and better behaviour.
39  *
40  * Olaf Erb : irtt wasn't being copied right.
41  * Bjorn Ekwall : Kerneld route support.
42  * Alan Cox : Multicast fixed (I hope)
43  * Pavel Krauz : Limited broadcast fixed
44  * Mike McLagan : Routing by source
45  * Alexey Kuznetsov : End of old history. Split to fib.c and
46  * route.c and rewritten from scratch.
47  * Andi Kleen : Load-limit warning messages.
48  * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49  * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50  * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51  * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52  * Marc Boucher : routing by fwmark
53  * Robert Olsson : Added rt_cache statistics
54  * Arnaldo C. Melo : Convert proc stuff to seq_file
55  * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56  * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57  * Ilia Sotnikov : Removed TOS from hash calculations
58  *
59  * This program is free software; you can redistribute it and/or
60  * modify it under the terms of the GNU General Public License
61  * as published by the Free Software Foundation; either version
62  * 2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111 
112 #define RT_FL_TOS(oldflp4) \
113  ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114 
115 #define IP_MAX_MTU 0xFFF0
116 
117 #define RT_GC_TIMEOUT (300*HZ)
118 
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly = 8;
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly = 256;
132 
133 /*
134  * Interface to generic destination cache.
135  */
136 
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143  struct sk_buff *skb, u32 mtu);
144 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145  struct sk_buff *skb);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149  int how)
150 {
151 }
152 
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155  WARN_ON(1);
156  return NULL;
157 }
158 
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160  struct sk_buff *skb,
161  const void *daddr);
162 
163 static struct dst_ops ipv4_dst_ops = {
164  .family = AF_INET,
165  .protocol = cpu_to_be16(ETH_P_IP),
166  .check = ipv4_dst_check,
167  .default_advmss = ipv4_default_advmss,
168  .mtu = ipv4_mtu,
169  .cow_metrics = ipv4_cow_metrics,
170  .destroy = ipv4_dst_destroy,
171  .ifdown = ipv4_dst_ifdown,
172  .negative_advice = ipv4_negative_advice,
173  .link_failure = ipv4_link_failure,
174  .update_pmtu = ip_rt_update_pmtu,
175  .redirect = ip_do_redirect,
176  .local_out = __ip_local_out,
177  .neigh_lookup = ipv4_neigh_lookup,
178 };
179 
180 #define ECN_OR_COST(class) TC_PRIO_##class
181 
182 const __u8 ip_tos2prio[16] = {
183  TC_PRIO_BESTEFFORT,
184  ECN_OR_COST(BESTEFFORT),
185  TC_PRIO_BESTEFFORT,
186  ECN_OR_COST(BESTEFFORT),
187  TC_PRIO_BULK,
188  ECN_OR_COST(BULK),
189  TC_PRIO_BULK,
190  ECN_OR_COST(BULK),
191  TC_PRIO_INTERACTIVE,
192  ECN_OR_COST(INTERACTIVE),
193  TC_PRIO_INTERACTIVE,
194  ECN_OR_COST(INTERACTIVE),
195  TC_PRIO_INTERACTIVE_BULK,
196  ECN_OR_COST(INTERACTIVE_BULK),
197  TC_PRIO_INTERACTIVE_BULK,
198  ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201 
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204 
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208  if (*pos)
209  return NULL;
210  return SEQ_START_TOKEN;
211 }
212 
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215  ++*pos;
216  return NULL;
217 }
218 
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 {
221 }
222 
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225  if (v == SEQ_START_TOKEN)
226  seq_printf(seq, "%-127s\n",
227  "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228  "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229  "HHUptod\tSpecDst");
230  return 0;
231 }
232 
233 static const struct seq_operations rt_cache_seq_ops = {
234  .start = rt_cache_seq_start,
235  .next = rt_cache_seq_next,
236  .stop = rt_cache_seq_stop,
237  .show = rt_cache_seq_show,
238 };
239 
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242  return seq_open(file, &rt_cache_seq_ops);
243 }
244 
245 static const struct file_operations rt_cache_seq_fops = {
246  .owner = THIS_MODULE,
247  .open = rt_cache_seq_open,
248  .read = seq_read,
249  .llseek = seq_lseek,
250  .release = seq_release,
251 };
252 
253 
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256  int cpu;
257 
258  if (*pos == 0)
259  return SEQ_START_TOKEN;
260 
261  for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262  if (!cpu_possible(cpu))
263  continue;
264  *pos = cpu+1;
265  return &per_cpu(rt_cache_stat, cpu);
266  }
267  return NULL;
268 }
269 
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272  int cpu;
273 
274  for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275  if (!cpu_possible(cpu))
276  continue;
277  *pos = cpu+1;
278  return &per_cpu(rt_cache_stat, cpu);
279  }
280  return NULL;
281 
282 }
283 
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 {
286 
287 }
288 
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291  struct rt_cache_stat *st = v;
292 
293  if (v == SEQ_START_TOKEN) {
294  seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295  return 0;
296  }
297 
298  seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299  " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300  dst_entries_get_slow(&ipv4_dst_ops),
301  st->in_hit,
302  st->in_slow_tot,
303  st->in_slow_mc,
304  st->in_no_route,
305  st->in_brd,
306  st->in_martian_dst,
307  st->in_martian_src,
308 
309  st->out_hit,
310  st->out_slow_tot,
311  st->out_slow_mc,
312 
313  st->gc_total,
314  st->gc_ignored,
315  st->gc_goal_miss,
316  st->gc_dst_overflow,
317  st->in_hlist_search,
318  st->out_hlist_search
319  );
320  return 0;
321 }
322 
323 static const struct seq_operations rt_cpu_seq_ops = {
324  .start = rt_cpu_seq_start,
325  .next = rt_cpu_seq_next,
326  .stop = rt_cpu_seq_stop,
327  .show = rt_cpu_seq_show,
328 };
329 
330 
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333  return seq_open(file, &rt_cpu_seq_ops);
334 }
335 
336 static const struct file_operations rt_cpu_seq_fops = {
337  .owner = THIS_MODULE,
338  .open = rt_cpu_seq_open,
339  .read = seq_read,
340  .llseek = seq_lseek,
341  .release = seq_release,
342 };
343 
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347  struct ip_rt_acct *dst, *src;
348  unsigned int i, j;
349 
350  dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351  if (!dst)
352  return -ENOMEM;
353 
354  for_each_possible_cpu(i) {
355  src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356  for (j = 0; j < 256; j++) {
357  dst[j].o_bytes += src[j].o_bytes;
358  dst[j].o_packets += src[j].o_packets;
359  dst[j].i_bytes += src[j].i_bytes;
360  dst[j].i_packets += src[j].i_packets;
361  }
362  }
363 
364  seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365  kfree(dst);
366  return 0;
367 }
368 
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371  return single_open(file, rt_acct_proc_show, NULL);
372 }
373 
374 static const struct file_operations rt_acct_proc_fops = {
375  .owner = THIS_MODULE,
376  .open = rt_acct_proc_open,
377  .read = seq_read,
378  .llseek = seq_lseek,
379  .release = single_release,
380 };
381 #endif
382 
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385  struct proc_dir_entry *pde;
386 
387  pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388  &rt_cache_seq_fops);
389  if (!pde)
390  goto err1;
391 
392  pde = proc_create("rt_cache", S_IRUGO,
393  net->proc_net_stat, &rt_cpu_seq_fops);
394  if (!pde)
395  goto err2;
396 
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398  pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399  if (!pde)
400  goto err3;
401 #endif
402  return 0;
403 
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406  remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409  remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411  return -ENOMEM;
412 }
413 
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416  remove_proc_entry("rt_cache", net->proc_net_stat);
417  remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419  remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422 
423 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424  .init = ip_rt_do_proc_init,
425  .exit = ip_rt_do_proc_exit,
426 };
427 
428 static int __init ip_rt_proc_init(void)
429 {
430  return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432 
433 #else
434 static inline int ip_rt_proc_init(void)
435 {
436  return 0;
437 }
438 #endif /* CONFIG_PROC_FS */
439 
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442  return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443 }
444 
445 void rt_cache_flush(struct net *net)
446 {
447  rt_genid_bump(net);
448 }
449 
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451  struct sk_buff *skb,
452  const void *daddr)
453 {
454  struct net_device *dev = dst->dev;
455  const __be32 *pkey = daddr;
456  const struct rtable *rt;
457  struct neighbour *n;
458 
459  rt = (const struct rtable *) dst;
460  if (rt->rt_gateway)
461  pkey = (const __be32 *) &rt->rt_gateway;
462  else if (skb)
463  pkey = &ip_hdr(skb)->daddr;
464 
465  n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466  if (n)
467  return n;
468  return neigh_create(&arp_tbl, pkey, dev);
469 }
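/* Editor's note: for a gatewayed route the neighbour lookup keys on the
 * gateway address; otherwise it keys on the packet's (or the caller's)
 * destination. On a cache miss we fall through to neigh_create(), so the
 * caller always gets either a neighbour or an ERR_PTR.
 */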
470 
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions. However,
473  * we can still generate some output.
474  * Random ID selection looks a bit dangerous because we have no chance of
475  * selecting an ID that is unique within a reasonable period of time.
476  * But a broken packet identifier may be better than no packet at all.
477  */
478 static void ip_select_fb_ident(struct iphdr *iph)
479 {
480  static DEFINE_SPINLOCK(ip_fb_id_lock);
481  static u32 ip_fallback_id;
482  u32 salt;
483 
484  spin_lock_bh(&ip_fb_id_lock);
485  salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486  iph->id = htons(salt & 0xFFFF);
487  ip_fallback_id = salt;
488  spin_unlock_bh(&ip_fb_id_lock);
489 }
490 
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492 {
493  struct net *net = dev_net(dst->dev);
494  struct inet_peer *peer;
495 
496  peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497  if (peer) {
498  iph->id = htons(inet_getid(peer, more));
499  inet_putpeer(peer);
500  return;
501  }
502 
503  ip_select_fb_ident(iph);
504 }
505 EXPORT_SYMBOL(__ip_select_ident);
506 
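/* Editor's note: a minimal usage sketch (illustration only, not from this
 * file) -- the output path stamps the IP ID of a non-DF packet like so:
 *
 *	struct iphdr *iph = ip_hdr(skb);
 *	__ip_select_ident(iph, skb_dst(skb), 0);
 *
 * The inetpeer tree provides a per-destination counter; the global,
 * salted fallback above is used only when peer allocation fails.
 */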
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508  const struct iphdr *iph,
509  int oif, u8 tos,
510  u8 prot, u32 mark, int flow_flags)
511 {
512  if (sk) {
513  const struct inet_sock *inet = inet_sk(sk);
514 
515  oif = sk->sk_bound_dev_if;
516  mark = sk->sk_mark;
517  tos = RT_CONN_FLAGS(sk);
518  prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519  }
520  flowi4_init_output(fl4, oif, mark, tos,
521  RT_SCOPE_UNIVERSE, prot,
522  flow_flags,
523  iph->daddr, iph->saddr, 0, 0);
524 }
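/* Editor's note: when a socket is supplied, its bound device, mark, TOS
 * and protocol deliberately override the values taken from the packet
 * header, so PMTU and redirect updates key on the same flow the socket
 * itself would emit.
 */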
525 
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527  const struct sock *sk)
528 {
529  const struct iphdr *iph = ip_hdr(skb);
530  int oif = skb->dev->ifindex;
531  u8 tos = RT_TOS(iph->tos);
532  u8 prot = iph->protocol;
533  u32 mark = skb->mark;
534 
535  __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537 
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540  const struct inet_sock *inet = inet_sk(sk);
541  const struct ip_options_rcu *inet_opt;
542  __be32 daddr = inet->inet_daddr;
543 
544  rcu_read_lock();
545  inet_opt = rcu_dereference(inet->inet_opt);
546  if (inet_opt && inet_opt->opt.srr)
547  daddr = inet_opt->opt.faddr;
548  flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549  RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550  inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551  inet_sk_flowi_flags(sk),
552  daddr, inet->inet_saddr, 0, 0);
553  rcu_read_unlock();
554 }
555 
556 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
557  const struct sk_buff *skb)
558 {
559  if (skb)
560  build_skb_flow_key(fl4, skb, sk);
561  else
562  build_sk_flow_key(fl4, sk);
563 }
564 
565 static inline void rt_free(struct rtable *rt)
566 {
567  call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569 
570 static DEFINE_SPINLOCK(fnhe_lock);
571 
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573 {
574  struct fib_nh_exception *fnhe, *oldest;
575  struct rtable *orig;
576 
577  oldest = rcu_dereference(hash->chain);
578  for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579  fnhe = rcu_dereference(fnhe->fnhe_next)) {
580  if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581  oldest = fnhe;
582  }
583  orig = rcu_dereference(oldest->fnhe_rth);
584  if (orig) {
585  RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586  rt_free(orig);
587  }
588  return oldest;
589 }
590 
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593  u32 hval;
594 
595  hval = (__force u32) daddr;
596  hval ^= (hval >> 11) ^ (hval >> 22);
597 
598  return hval & (FNHE_HASH_SIZE - 1);
599 }
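/* Editor's note: the shift-and-xor above folds the high bits of the
 * destination into the low bits, so all 32 bits of the address influence
 * the bucket; FNHE_HASH_SIZE is a power of two, making the mask a cheap
 * modulo. A minimal sketch of the same folding, assuming a hypothetical
 * 2048-entry table:
 *
 *	u32 h = (__force u32) daddr;
 *	h ^= (h >> 11) ^ (h >> 22);
 *	idx = h & 2047;
 */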
600 
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602  u32 pmtu, unsigned long expires)
603 {
604  struct fnhe_hash_bucket *hash;
605  struct fib_nh_exception *fnhe;
606  int depth;
607  u32 hval = fnhe_hashfun(daddr);
608 
609  spin_lock_bh(&fnhe_lock);
610 
611  hash = nh->nh_exceptions;
612  if (!hash) {
613  hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614  if (!hash)
615  goto out_unlock;
616  nh->nh_exceptions = hash;
617  }
618 
619  hash += hval;
620 
621  depth = 0;
622  for (fnhe = rcu_dereference(hash->chain); fnhe;
623  fnhe = rcu_dereference(fnhe->fnhe_next)) {
624  if (fnhe->fnhe_daddr == daddr)
625  break;
626  depth++;
627  }
628 
629  if (fnhe) {
630  if (gw)
631  fnhe->fnhe_gw = gw;
632  if (pmtu) {
633  fnhe->fnhe_pmtu = pmtu;
634  fnhe->fnhe_expires = expires;
635  }
636  } else {
637  if (depth > FNHE_RECLAIM_DEPTH)
638  fnhe = fnhe_oldest(hash);
639  else {
640  fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641  if (!fnhe)
642  goto out_unlock;
643 
644  fnhe->fnhe_next = hash->chain;
645  rcu_assign_pointer(hash->chain, fnhe);
646  }
647  fnhe->fnhe_daddr = daddr;
648  fnhe->fnhe_gw = gw;
649  fnhe->fnhe_pmtu = pmtu;
650  fnhe->fnhe_expires = expires;
651  }
652 
653  fnhe->fnhe_stamp = jiffies;
654 
655 out_unlock:
656  spin_unlock_bh(&fnhe_lock);
657  return;
658 }
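/* Editor's note: exceptions are chained per hash bucket; once a chain is
 * deeper than FNHE_RECLAIM_DEPTH, the entry with the oldest fnhe_stamp
 * is recycled instead of allocating a new one, bounding both memory and
 * lookup cost under a flood of per-destination PMTU/redirect updates.
 */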
659 
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661  bool kill_route)
662 {
663  __be32 new_gw = icmp_hdr(skb)->un.gateway;
664  __be32 old_gw = ip_hdr(skb)->saddr;
665  struct net_device *dev = skb->dev;
666  struct in_device *in_dev;
667  struct fib_result res;
668  struct neighbour *n;
669  struct net *net;
670 
671  switch (icmp_hdr(skb)->code & 7) {
672  case ICMP_REDIR_NET:
673  case ICMP_REDIR_NETTOS:
674  case ICMP_REDIR_HOST:
675  case ICMP_REDIR_HOSTTOS:
676  break;
677 
678  default:
679  return;
680  }
681 
682  if (rt->rt_gateway != old_gw)
683  return;
684 
685  in_dev = __in_dev_get_rcu(dev);
686  if (!in_dev)
687  return;
688 
689  net = dev_net(dev);
690  if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691  ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692  ipv4_is_zeronet(new_gw))
693  goto reject_redirect;
694 
695  if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696  if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697  goto reject_redirect;
698  if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699  goto reject_redirect;
700  } else {
701  if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702  goto reject_redirect;
703  }
704 
705  n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706  if (n) {
707  if (!(n->nud_state & NUD_VALID)) {
708  neigh_event_send(n, NULL);
709  } else {
710  if (fib_lookup(net, fl4, &res) == 0) {
711  struct fib_nh *nh = &FIB_RES_NH(res);
712 
713  update_or_create_fnhe(nh, fl4->daddr, new_gw,
714  0, 0);
715  }
716  if (kill_route)
717  rt->dst.obsolete = DST_OBSOLETE_KILL;
718  call_netevent_notifiers(NETEVENT_REDIRECT, &rt->dst);
719  }
720  neigh_release(n);
721  }
722  return;
723 
724 reject_redirect:
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726  if (IN_DEV_LOG_MARTIANS(in_dev)) {
727  const struct iphdr *iph = (const struct iphdr *) skb->data;
728  __be32 daddr = iph->daddr;
729  __be32 saddr = iph->saddr;
730 
731  net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732  " Advised path = %pI4 -> %pI4\n",
733  &old_gw, dev->name, &new_gw,
734  &saddr, &daddr);
735  }
736 #endif
737  ;
738 }
739 
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742  struct rtable *rt;
743  struct flowi4 fl4;
744 
745  rt = (struct rtable *) dst;
746 
747  ip_rt_build_flow_key(&fl4, sk, skb);
748  __ip_do_redirect(rt, skb, &fl4, true);
749 }
750 
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753  struct rtable *rt = (struct rtable *)dst;
754  struct dst_entry *ret = dst;
755 
756  if (rt) {
757  if (dst->obsolete > 0) {
758  ip_rt_put(rt);
759  ret = NULL;
760  } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761  rt->dst.expires) {
762  ip_rt_put(rt);
763  ret = NULL;
764  }
765  }
766  return ret;
767 }
768 
769 /*
770  * Algorithm:
771  * 1. The first ip_rt_redirect_number redirects are sent
772  * with exponential backoff, then we stop sending them at all,
773  * assuming that the host ignores our redirects.
774  * 2. If we did not see packets requiring redirects
775  * during ip_rt_redirect_silence, we assume that the host
776  * forgot the redirected route and start to send redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
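/* Editor's note: a worked example, assuming HZ == 1000 and the defaults
 * above. ip_rt_redirect_load is HZ/50 == 20 jiffies, and a redirect is
 * only sent once
 *	time_after(jiffies, rate_last + (20 << rate_tokens))
 * holds, so the allowed gap doubles each time (40, 80, 160, ... jiffies).
 * After ip_rt_redirect_number (9) unheeded redirects we go silent; after
 * ip_rt_redirect_silence == (HZ/50) << 10 == 20480 jiffies (~20s) of
 * quiet the token count resets and redirects resume.
 */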
784 
785 void ip_rt_send_redirect(struct sk_buff *skb)
786 {
787  struct rtable *rt = skb_rtable(skb);
788  struct in_device *in_dev;
789  struct inet_peer *peer;
790  struct net *net;
791  int log_martians;
792 
793  rcu_read_lock();
794  in_dev = __in_dev_get_rcu(rt->dst.dev);
795  if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796  rcu_read_unlock();
797  return;
798  }
799  log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800  rcu_read_unlock();
801 
802  net = dev_net(rt->dst.dev);
803  peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804  if (!peer) {
805  icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
806  rt_nexthop(rt, ip_hdr(skb)->daddr));
807  return;
808  }
809 
810  /* No redirected packets during ip_rt_redirect_silence;
811  * reset the algorithm.
812  */
813  if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
814  peer->rate_tokens = 0;
815 
816  /* Too many ignored redirects; do not send anything.
817  * Set peer->rate_last to the time of the last seen redirected packet.
818  */
819  if (peer->rate_tokens >= ip_rt_redirect_number) {
820  peer->rate_last = jiffies;
821  goto out_put_peer;
822  }
823 
824  /* Check for load limit; set rate_last to the latest sent
825  * redirect.
826  */
827  if (peer->rate_tokens == 0 ||
828  time_after(jiffies,
829  (peer->rate_last +
830  (ip_rt_redirect_load << peer->rate_tokens)))) {
831  __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
832 
833  icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834  peer->rate_last = jiffies;
835  ++peer->rate_tokens;
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837  if (log_martians &&
838  peer->rate_tokens == ip_rt_redirect_number)
839  net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
840  &ip_hdr(skb)->saddr, inet_iif(skb),
841  &ip_hdr(skb)->daddr, &gw);
842 #endif
843  }
844 out_put_peer:
845  inet_putpeer(peer);
846 }
847 
848 static int ip_error(struct sk_buff *skb)
849 {
850  struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851  struct rtable *rt = skb_rtable(skb);
852  struct inet_peer *peer;
853  unsigned long now;
854  struct net *net;
855  bool send;
856  int code;
857 
858  net = dev_net(rt->dst.dev);
859  if (!IN_DEV_FORWARD(in_dev)) {
860  switch (rt->dst.error) {
861  case EHOSTUNREACH:
862  IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863  break;
864 
865  case ENETUNREACH:
866  IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867  break;
868  }
869  goto out;
870  }
871 
872  switch (rt->dst.error) {
873  case EINVAL:
874  default:
875  goto out;
876  case EHOSTUNREACH:
877  code = ICMP_HOST_UNREACH;
878  break;
879  case ENETUNREACH:
880  code = ICMP_NET_UNREACH;
881  IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882  break;
883  case EACCES:
884  code = ICMP_PKT_FILTERED;
885  break;
886  }
887 
888  peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889 
890  send = true;
891  if (peer) {
892  now = jiffies;
893  peer->rate_tokens += now - peer->rate_last;
894  if (peer->rate_tokens > ip_rt_error_burst)
895  peer->rate_tokens = ip_rt_error_burst;
896  peer->rate_last = now;
897  if (peer->rate_tokens >= ip_rt_error_cost)
898  peer->rate_tokens -= ip_rt_error_cost;
899  else
900  send = false;
901  inet_putpeer(peer);
902  }
903  if (send)
904  icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905 
906 out: kfree_skb(skb);
907  return 0;
908 }
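/* Editor's note: the peer logic above is a token bucket. Elapsed jiffies
 * are credited as tokens (capped at ip_rt_error_burst == 5*HZ) and each
 * ICMP_DEST_UNREACH costs ip_rt_error_cost (HZ), so with the defaults a
 * source may trigger a burst of five errors, then one per second.
 */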
909 
910 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
911 {
912  struct dst_entry *dst = &rt->dst;
913  struct fib_result res;
914 
915  if (dst->dev->mtu < mtu)
916  return;
917 
918  if (mtu < ip_rt_min_pmtu)
919  mtu = ip_rt_min_pmtu;
920 
921  if (!rt->rt_pmtu) {
922  dst->obsolete = DST_OBSOLETE_KILL;
923  } else {
924  rt->rt_pmtu = mtu;
925  dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
926  }
927 
928  rcu_read_lock();
929  if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
930  struct fib_nh *nh = &FIB_RES_NH(res);
931 
932  update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
933  jiffies + ip_rt_mtu_expires);
934  }
935  rcu_read_unlock();
936 }
937 
938 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
939  struct sk_buff *skb, u32 mtu)
940 {
941  struct rtable *rt = (struct rtable *) dst;
942  struct flowi4 fl4;
943 
944  ip_rt_build_flow_key(&fl4, sk, skb);
945  __ip_rt_update_pmtu(rt, &fl4, mtu);
946 }
947 
948 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
949  int oif, u32 mark, u8 protocol, int flow_flags)
950 {
951  const struct iphdr *iph = (const struct iphdr *) skb->data;
952  struct flowi4 fl4;
953  struct rtable *rt;
954 
955  __build_flow_key(&fl4, NULL, iph, oif,
956  RT_TOS(iph->tos), protocol, mark, flow_flags);
957  rt = __ip_route_output_key(net, &fl4);
958  if (!IS_ERR(rt)) {
959  __ip_rt_update_pmtu(rt, &fl4, mtu);
960  ip_rt_put(rt);
961  }
962 }
963 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
964 
965 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
966 {
967  const struct iphdr *iph = (const struct iphdr *) skb->data;
968  struct flowi4 fl4;
969  struct rtable *rt;
970 
971  __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
972  rt = __ip_route_output_key(sock_net(sk), &fl4);
973  if (!IS_ERR(rt)) {
974  __ip_rt_update_pmtu(rt, &fl4, mtu);
975  ip_rt_put(rt);
976  }
977 }
978 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
979 
980 void ipv4_redirect(struct sk_buff *skb, struct net *net,
981  int oif, u32 mark, u8 protocol, int flow_flags)
982 {
983  const struct iphdr *iph = (const struct iphdr *) skb->data;
984  struct flowi4 fl4;
985  struct rtable *rt;
986 
987  __build_flow_key(&fl4, NULL, iph, oif,
988  RT_TOS(iph->tos), protocol, mark, flow_flags);
989  rt = __ip_route_output_key(net, &fl4);
990  if (!IS_ERR(rt)) {
991  __ip_do_redirect(rt, skb, &fl4, false);
992  ip_rt_put(rt);
993  }
994 }
995 EXPORT_SYMBOL_GPL(ipv4_redirect);
996 
997 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
998 {
999  const struct iphdr *iph = (const struct iphdr *) skb->data;
1000  struct flowi4 fl4;
1001  struct rtable *rt;
1002 
1003  __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004  rt = __ip_route_output_key(sock_net(sk), &fl4);
1005  if (!IS_ERR(rt)) {
1006  __ip_do_redirect(rt, skb, &fl4, false);
1007  ip_rt_put(rt);
1008  }
1009 }
1010 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011 
1012 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013 {
1014  struct rtable *rt = (struct rtable *) dst;
1015 
1016  /* All IPV4 dsts are created with ->obsolete set to the value
1017  * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1018  * into this function always.
1019  *
1020  * When a PMTU/redirect information update invalidates a
1021  * route, this is indicated by setting obsolete to
1022  * DST_OBSOLETE_KILL.
1023  */
1024  if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025  return NULL;
1026  return dst;
1027 }
1028 
1029 static void ipv4_link_failure(struct sk_buff *skb)
1030 {
1031  struct rtable *rt;
1032 
1033  icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034 
1035  rt = skb_rtable(skb);
1036  if (rt)
1037  dst_set_expires(&rt->dst, 0);
1038 }
1039 
1040 static int ip_rt_bug(struct sk_buff *skb)
1041 {
1042  pr_debug("%s: %pI4 -> %pI4, %s\n",
1043  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044  skb->dev ? skb->dev->name : "?");
1045  kfree_skb(skb);
1046  WARN_ON(1);
1047  return 0;
1048 }
1049 
1050 /*
1051  We do not cache the source address of the outgoing interface,
1052  because it is used only by the IP RR, TS and SRR options,
1053  so it is out of the fast path.
1054 
1055  BTW remember: "addr" is allowed to be unaligned
1056  in IP options!
1057  */
1058 
1059 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1060 {
1061  __be32 src;
1062 
1063  if (rt_is_output_route(rt))
1064  src = ip_hdr(skb)->saddr;
1065  else {
1066  struct fib_result res;
1067  struct flowi4 fl4;
1068  struct iphdr *iph;
1069 
1070  iph = ip_hdr(skb);
1071 
1072  memset(&fl4, 0, sizeof(fl4));
1073  fl4.daddr = iph->daddr;
1074  fl4.saddr = iph->saddr;
1075  fl4.flowi4_tos = RT_TOS(iph->tos);
1076  fl4.flowi4_oif = rt->dst.dev->ifindex;
1077  fl4.flowi4_iif = skb->dev->ifindex;
1078  fl4.flowi4_mark = skb->mark;
1079 
1080  rcu_read_lock();
1081  if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082  src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083  else
1084  src = inet_select_addr(rt->dst.dev,
1085  rt_nexthop(rt, iph->daddr),
1086  RT_SCOPE_UNIVERSE);
1087  rcu_read_unlock();
1088  }
1089  memcpy(addr, &src, 4);
1090 }
1091 
1092 #ifdef CONFIG_IP_ROUTE_CLASSID
1093 static void set_class_tag(struct rtable *rt, u32 tag)
1094 {
1095  if (!(rt->dst.tclassid & 0xFFFF))
1096  rt->dst.tclassid |= tag & 0xFFFF;
1097  if (!(rt->dst.tclassid & 0xFFFF0000))
1098  rt->dst.tclassid |= tag & 0xFFFF0000;
1099 }
1100 #endif
1101 
1102 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103 {
1104  unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105 
1106  if (advmss == 0) {
1107  advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108  ip_rt_min_advmss);
1109  if (advmss > 65535 - 40)
1110  advmss = 65535 - 40;
1111  }
1112  return advmss;
1113 }
1114 
1115 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1116 {
1117  const struct rtable *rt = (const struct rtable *) dst;
1118  unsigned int mtu = rt->rt_pmtu;
1119 
1120  if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1121  mtu = dst_metric_raw(dst, RTAX_MTU);
1122 
1123  if (mtu && rt_is_output_route(rt))
1124  return mtu;
1125 
1126  mtu = dst->dev->mtu;
1127 
1128  if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1129  if (rt->rt_uses_gateway && mtu > 576)
1130  mtu = 576;
1131  }
1132 
1133  if (mtu > IP_MAX_MTU)
1134  mtu = IP_MAX_MTU;
1135 
1136  return mtu;
1137 }
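/* Editor's note: the precedence above is, roughly: an unexpired learned
 * PMTU on the route, then the RTAX_MTU metric (returned as-is only for
 * output routes), then the device MTU, clamped to 576 for locked-metric
 * gatewayed routes and to IP_MAX_MTU (0xFFF0) overall.
 */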
1138 
1139 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1140 {
1141  struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1142  struct fib_nh_exception *fnhe;
1143  u32 hval;
1144 
1145  if (!hash)
1146  return NULL;
1147 
1148  hval = fnhe_hashfun(daddr);
1149 
1150  for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1151  fnhe = rcu_dereference(fnhe->fnhe_next)) {
1152  if (fnhe->fnhe_daddr == daddr)
1153  return fnhe;
1154  }
1155  return NULL;
1156 }
1157 
1158 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1159  __be32 daddr)
1160 {
1161  bool ret = false;
1162 
1163  spin_lock_bh(&fnhe_lock);
1164 
1165  if (daddr == fnhe->fnhe_daddr) {
1166  struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1167  if (orig && rt_is_expired(orig)) {
1168  fnhe->fnhe_gw = 0;
1169  fnhe->fnhe_pmtu = 0;
1170  fnhe->fnhe_expires = 0;
1171  }
1172  if (fnhe->fnhe_pmtu) {
1173  unsigned long expires = fnhe->fnhe_expires;
1174  unsigned long diff = expires - jiffies;
1175 
1176  if (time_before(jiffies, expires)) {
1177  rt->rt_pmtu = fnhe->fnhe_pmtu;
1178  dst_set_expires(&rt->dst, diff);
1179  }
1180  }
1181  if (fnhe->fnhe_gw) {
1182  rt->rt_flags |= RTCF_REDIRECTED;
1183  rt->rt_gateway = fnhe->fnhe_gw;
1184  rt->rt_uses_gateway = 1;
1185  } else if (!rt->rt_gateway)
1186  rt->rt_gateway = daddr;
1187 
1188  rcu_assign_pointer(fnhe->fnhe_rth, rt);
1189  if (orig)
1190  rt_free(orig);
1191 
1192  fnhe->fnhe_stamp = jiffies;
1193  ret = true;
1194  }
1195  spin_unlock_bh(&fnhe_lock);
1196 
1197  return ret;
1198 }
1199 
1200 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1201 {
1202  struct rtable *orig, *prev, **p;
1203  bool ret = true;
1204 
1205  if (rt_is_input_route(rt)) {
1206  p = (struct rtable **)&nh->nh_rth_input;
1207  } else {
1208  p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1209  }
1210  orig = *p;
1211 
1212  prev = cmpxchg(p, orig, rt);
1213  if (prev == orig) {
1214  if (orig)
1215  rt_free(orig);
1216  } else
1217  ret = false;
1218 
1219  return ret;
1220 }
1221 
1222 static DEFINE_SPINLOCK(rt_uncached_lock);
1223 static LIST_HEAD(rt_uncached_list);
1224 
1225 static void rt_add_uncached_list(struct rtable *rt)
1226 {
1227  spin_lock_bh(&rt_uncached_lock);
1228  list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1229  spin_unlock_bh(&rt_uncached_lock);
1230 }
1231 
1232 static void ipv4_dst_destroy(struct dst_entry *dst)
1233 {
1234  struct rtable *rt = (struct rtable *) dst;
1235 
1236  if (!list_empty(&rt->rt_uncached)) {
1237  spin_lock_bh(&rt_uncached_lock);
1238  list_del(&rt->rt_uncached);
1239  spin_unlock_bh(&rt_uncached_lock);
1240  }
1241 }
1242 
1243 void rt_flush_dev(struct net_device *dev)
1244 {
1245  if (!list_empty(&rt_uncached_list)) {
1246  struct net *net = dev_net(dev);
1247  struct rtable *rt;
1248 
1249  spin_lock_bh(&rt_uncached_lock);
1250  list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1251  if (rt->dst.dev != dev)
1252  continue;
1253  rt->dst.dev = net->loopback_dev;
1254  dev_hold(rt->dst.dev);
1255  dev_put(dev);
1256  }
1257  spin_unlock_bh(&rt_uncached_lock);
1258  }
1259 }
1260 
1261 static bool rt_cache_valid(const struct rtable *rt)
1262 {
1263  return rt &&
1264  rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265  !rt_is_expired(rt);
1266 }
1267 
1268 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1269  const struct fib_result *res,
1270  struct fib_nh_exception *fnhe,
1271  struct fib_info *fi, u16 type, u32 itag)
1272 {
1273  bool cached = false;
1274 
1275  if (fi) {
1276  struct fib_nh *nh = &FIB_RES_NH(*res);
1277 
1278  if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1279  rt->rt_gateway = nh->nh_gw;
1280  rt->rt_uses_gateway = 1;
1281  }
1282  dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1283 #ifdef CONFIG_IP_ROUTE_CLASSID
1284  rt->dst.tclassid = nh->nh_tclassid;
1285 #endif
1286  if (unlikely(fnhe))
1287  cached = rt_bind_exception(rt, fnhe, daddr);
1288  else if (!(rt->dst.flags & DST_NOCACHE))
1289  cached = rt_cache_route(nh, rt);
1290  if (unlikely(!cached)) {
1291  /* Routes we intend to cache in nexthop exception or
1292  * FIB nexthop have the DST_NOCACHE bit clear.
1293  * However, if we are unsuccessful at storing this
1294  * route into the cache we really need to set it.
1295  */
1296  rt->dst.flags |= DST_NOCACHE;
1297  if (!rt->rt_gateway)
1298  rt->rt_gateway = daddr;
1299  rt_add_uncached_list(rt);
1300  }
1301  } else
1302  rt_add_uncached_list(rt);
1303 
1304 #ifdef CONFIG_IP_ROUTE_CLASSID
1305 #ifdef CONFIG_IP_MULTIPLE_TABLES
1306  set_class_tag(rt, res->tclassid);
1307 #endif
1308  set_class_tag(rt, itag);
1309 #endif
1310 }
1311 
1312 static struct rtable *rt_dst_alloc(struct net_device *dev,
1313  bool nopolicy, bool noxfrm, bool will_cache)
1314 {
1315  return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1316  (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1317  (nopolicy ? DST_NOPOLICY : 0) |
1318  (noxfrm ? DST_NOXFRM : 0));
1319 }
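/* Editor's note: every rtable is created with DST_OBSOLETE_FORCE_CHK so
 * dst_check() always funnels into ipv4_dst_check(); DST_NOCACHE is set
 * up front for routes we already know will not be cached in a nexthop.
 */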
1320 
1321 /* called in rcu_read_lock() section */
1322 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1323  u8 tos, struct net_device *dev, int our)
1324 {
1325  struct rtable *rth;
1326  struct in_device *in_dev = __in_dev_get_rcu(dev);
1327  u32 itag = 0;
1328  int err;
1329 
1330  /* Primary sanity checks. */
1331 
1332  if (in_dev == NULL)
1333  return -EINVAL;
1334 
1335  if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1336  skb->protocol != htons(ETH_P_IP))
1337  goto e_inval;
1338 
1339  if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1340  if (ipv4_is_loopback(saddr))
1341  goto e_inval;
1342 
1343  if (ipv4_is_zeronet(saddr)) {
1344  if (!ipv4_is_local_multicast(daddr))
1345  goto e_inval;
1346  } else {
1347  err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1348  in_dev, &itag);
1349  if (err < 0)
1350  goto e_err;
1351  }
1352  rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1353  IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1354  if (!rth)
1355  goto e_nobufs;
1356 
1357 #ifdef CONFIG_IP_ROUTE_CLASSID
1358  rth->dst.tclassid = itag;
1359 #endif
1360  rth->dst.output = ip_rt_bug;
1361 
1362  rth->rt_genid = rt_genid(dev_net(dev));
1363  rth->rt_flags = RTCF_MULTICAST;
1364  rth->rt_type = RTN_MULTICAST;
1365  rth->rt_is_input= 1;
1366  rth->rt_iif = 0;
1367  rth->rt_pmtu = 0;
1368  rth->rt_gateway = 0;
1369  rth->rt_uses_gateway = 0;
1370  INIT_LIST_HEAD(&rth->rt_uncached);
1371  if (our) {
1372  rth->dst.input= ip_local_deliver;
1373  rth->rt_flags |= RTCF_LOCAL;
1374  }
1375 
1376 #ifdef CONFIG_IP_MROUTE
1377  if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1378  rth->dst.input = ip_mr_input;
1379 #endif
1380  RT_CACHE_STAT_INC(in_slow_mc);
1381 
1382  skb_dst_set(skb, &rth->dst);
1383  return 0;
1384 
1385 e_nobufs:
1386  return -ENOBUFS;
1387 e_inval:
1388  return -EINVAL;
1389 e_err:
1390  return err;
1391 }
1392 
1393 
1394 static void ip_handle_martian_source(struct net_device *dev,
1395  struct in_device *in_dev,
1396  struct sk_buff *skb,
1397  __be32 daddr,
1398  __be32 saddr)
1399 {
1400  RT_CACHE_STAT_INC(in_martian_src);
1401 #ifdef CONFIG_IP_ROUTE_VERBOSE
1402  if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1403  /*
1404  * RFC1812 recommendation, if source is martian,
1405  * the only hint is MAC header.
1406  */
1407  pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1408  &daddr, &saddr, dev->name);
1409  if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1410  print_hex_dump(KERN_WARNING, "ll header: ",
1411  DUMP_PREFIX_OFFSET, 16, 1,
1412  skb_mac_header(skb),
1413  dev->hard_header_len, true);
1414  }
1415  }
1416 #endif
1417 }
1418 
1419 /* called in rcu_read_lock() section */
1420 static int __mkroute_input(struct sk_buff *skb,
1421  const struct fib_result *res,
1422  struct in_device *in_dev,
1423  __be32 daddr, __be32 saddr, u32 tos)
1424 {
1425  struct rtable *rth;
1426  int err;
1427  struct in_device *out_dev;
1428  unsigned int flags = 0;
1429  bool do_cache;
1430  u32 itag;
1431 
1432  /* get a working reference to the output device */
1433  out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1434  if (out_dev == NULL) {
1435  net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1436  return -EINVAL;
1437  }
1438 
1439  err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1440  in_dev->dev, in_dev, &itag);
1441  if (err < 0) {
1442  ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1443  saddr);
1444 
1445  goto cleanup;
1446  }
1447 
1448  do_cache = res->fi && !itag;
1449  if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1450  (IN_DEV_SHARED_MEDIA(out_dev) ||
1451  inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1452  flags |= RTCF_DOREDIRECT;
1453  do_cache = false;
1454  }
1455 
1456  if (skb->protocol != htons(ETH_P_IP)) {
1457  /* Not IP (i.e. ARP). Do not create a route if it is
1458  * invalid for proxy arp. DNAT routes are always valid.
1459  *
1460  * The proxy arp feature has been extended to allow ARP
1461  * replies back out the same interface, to support
1462  * Private VLAN switch technologies. See arp.c.
1463  */
1464  if (out_dev == in_dev &&
1465  IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1466  err = -EINVAL;
1467  goto cleanup;
1468  }
1469  }
1470 
1471  if (do_cache) {
1472  rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1473  if (rt_cache_valid(rth)) {
1474  skb_dst_set_noref(skb, &rth->dst);
1475  goto out;
1476  }
1477  }
1478 
1479  rth = rt_dst_alloc(out_dev->dev,
1480  IN_DEV_CONF_GET(in_dev, NOPOLICY),
1481  IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1482  if (!rth) {
1483  err = -ENOBUFS;
1484  goto cleanup;
1485  }
1486 
1487  rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1488  rth->rt_flags = flags;
1489  rth->rt_type = res->type;
1490  rth->rt_is_input = 1;
1491  rth->rt_iif = 0;
1492  rth->rt_pmtu = 0;
1493  rth->rt_gateway = 0;
1494  rth->rt_uses_gateway = 0;
1495  INIT_LIST_HEAD(&rth->rt_uncached);
1496 
1497  rth->dst.input = ip_forward;
1498  rth->dst.output = ip_output;
1499 
1500  rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1501  skb_dst_set(skb, &rth->dst);
1502 out:
1503  err = 0;
1504  cleanup:
1505  return err;
1506 }
1507 
1508 static int ip_mkroute_input(struct sk_buff *skb,
1509  struct fib_result *res,
1510  const struct flowi4 *fl4,
1511  struct in_device *in_dev,
1512  __be32 daddr, __be32 saddr, u32 tos)
1513 {
1514 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1515  if (res->fi && res->fi->fib_nhs > 1)
1516  fib_select_multipath(res);
1517 #endif
1518 
1519  /* create a routing cache entry */
1520  return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1521 }
1522 
1523 /*
1524  * NOTE. We drop all packets that have local source
1525  * addresses, because every properly looped-back packet
1526  * must already have the correct destination attached by the output routine.
1527  *
1528  * Such an approach solves two big problems:
1529  * 1. Non-simplex devices are handled properly.
1530  * 2. IP spoofing attempts are filtered with a 100% guarantee.
1531  * called with rcu_read_lock()
1532  */
1533 
1534 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1535  u8 tos, struct net_device *dev)
1536 {
1537  struct fib_result res;
1538  struct in_device *in_dev = __in_dev_get_rcu(dev);
1539  struct flowi4 fl4;
1540  unsigned int flags = 0;
1541  u32 itag = 0;
1542  struct rtable *rth;
1543  int err = -EINVAL;
1544  struct net *net = dev_net(dev);
1545  bool do_cache;
1546 
1547  /* IP on this device is disabled. */
1548 
1549  if (!in_dev)
1550  goto out;
1551 
1552  /* Check for the most weird martians, which cannot be detected
1553  by fib_lookup.
1554  */
1555 
1556  if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1557  goto martian_source;
1558 
1559  res.fi = NULL;
1560  if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1561  goto brd_input;
1562 
1563  /* Accept zero addresses only to limited broadcast;
1564  * I do not even know whether to fix it or not. Waiting for complaints :-)
1565  */
1566  if (ipv4_is_zeronet(saddr))
1567  goto martian_source;
1568 
1569  if (ipv4_is_zeronet(daddr))
1570  goto martian_destination;
1571 
1572  /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1573  * calling it at most once when daddr and/or saddr are loopback addresses
1574  */
1575  if (ipv4_is_loopback(daddr)) {
1576  if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1577  goto martian_destination;
1578  } else if (ipv4_is_loopback(saddr)) {
1579  if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1580  goto martian_source;
1581  }
1582 
1583  /*
1584  * Now we are ready to route packet.
1585  */
1586  fl4.flowi4_oif = 0;
1587  fl4.flowi4_iif = dev->ifindex;
1588  fl4.flowi4_mark = skb->mark;
1589  fl4.flowi4_tos = tos;
1590  fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1591  fl4.daddr = daddr;
1592  fl4.saddr = saddr;
1593  err = fib_lookup(net, &fl4, &res);
1594  if (err != 0)
1595  goto no_route;
1596 
1597  RT_CACHE_STAT_INC(in_slow_tot);
1598 
1599  if (res.type == RTN_BROADCAST)
1600  goto brd_input;
1601 
1602  if (res.type == RTN_LOCAL) {
1603  err = fib_validate_source(skb, saddr, daddr, tos,
1604  LOOPBACK_IFINDEX,
1605  dev, in_dev, &itag);
1606  if (err < 0)
1607  goto martian_source_keep_err;
1608  goto local_input;
1609  }
1610 
1611  if (!IN_DEV_FORWARD(in_dev))
1612  goto no_route;
1613  if (res.type != RTN_UNICAST)
1614  goto martian_destination;
1615 
1616  err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1617 out: return err;
1618 
1619 brd_input:
1620  if (skb->protocol != htons(ETH_P_IP))
1621  goto e_inval;
1622 
1623  if (!ipv4_is_zeronet(saddr)) {
1624  err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1625  in_dev, &itag);
1626  if (err < 0)
1627  goto martian_source_keep_err;
1628  }
1629  flags |= RTCF_BROADCAST;
1630  res.type = RTN_BROADCAST;
1631  RT_CACHE_STAT_INC(in_brd);
1632 
1633 local_input:
1634  do_cache = false;
1635  if (res.fi) {
1636  if (!itag) {
1637  rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1638  if (rt_cache_valid(rth)) {
1639  skb_dst_set_noref(skb, &rth->dst);
1640  err = 0;
1641  goto out;
1642  }
1643  do_cache = true;
1644  }
1645  }
1646 
1647  rth = rt_dst_alloc(net->loopback_dev,
1648  IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1649  if (!rth)
1650  goto e_nobufs;
1651 
1652  rth->dst.input= ip_local_deliver;
1653  rth->dst.output= ip_rt_bug;
1654 #ifdef CONFIG_IP_ROUTE_CLASSID
1655  rth->dst.tclassid = itag;
1656 #endif
1657 
1658  rth->rt_genid = rt_genid(net);
1659  rth->rt_flags = flags|RTCF_LOCAL;
1660  rth->rt_type = res.type;
1661  rth->rt_is_input = 1;
1662  rth->rt_iif = 0;
1663  rth->rt_pmtu = 0;
1664  rth->rt_gateway = 0;
1665  rth->rt_uses_gateway = 0;
1666  INIT_LIST_HEAD(&rth->rt_uncached);
1667  if (res.type == RTN_UNREACHABLE) {
1668  rth->dst.input= ip_error;
1669  rth->dst.error= -err;
1670  rth->rt_flags &= ~RTCF_LOCAL;
1671  }
1672  if (do_cache)
1673  rt_cache_route(&FIB_RES_NH(res), rth);
1674  skb_dst_set(skb, &rth->dst);
1675  err = 0;
1676  goto out;
1677 
1678 no_route:
1679  RT_CACHE_STAT_INC(in_no_route);
1680  res.type = RTN_UNREACHABLE;
1681  if (err == -ESRCH)
1682  err = -ENETUNREACH;
1683  goto local_input;
1684 
1685  /*
1686  * Do not cache martian addresses: they should be logged (RFC1812)
1687  */
1688 martian_destination:
1689  RT_CACHE_STAT_INC(in_martian_dst);
1690 #ifdef CONFIG_IP_ROUTE_VERBOSE
1691  if (IN_DEV_LOG_MARTIANS(in_dev))
1692  net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1693  &daddr, &saddr, dev->name);
1694 #endif
1695 
1696 e_inval:
1697  err = -EINVAL;
1698  goto out;
1699 
1700 e_nobufs:
1701  err = -ENOBUFS;
1702  goto out;
1703 
1704 martian_source:
1705  err = -EINVAL;
1706 martian_source_keep_err:
1707  ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1708  goto out;
1709 }
1710 
1711 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712  u8 tos, struct net_device *dev)
1713 {
1714  int res;
1715 
1716  rcu_read_lock();
1717 
1718  /* Multicast recognition logic was moved from the route cache to here.
1719  The problem was that too many Ethernet cards have broken/missing
1720  hardware multicast filters :-( As a result, a host on a multicast
1721  network acquires a lot of useless route cache entries, sort of
1722  SDR messages from all the world. Now we try to get rid of them.
1723  Really, provided the software IP multicast filter is organized
1724  reasonably (at least, hashed), it does not result in a slowdown
1725  compared with route cache reject entries.
1726  Note that multicast routers are not affected, because a
1727  route cache entry is created eventually.
1728  */
1729  if (ipv4_is_multicast(daddr)) {
1730  struct in_device *in_dev = __in_dev_get_rcu(dev);
1731 
1732  if (in_dev) {
1733  int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1734  ip_hdr(skb)->protocol);
1735  if (our
1736 #ifdef CONFIG_IP_MROUTE
1737  ||
1738  (!ipv4_is_local_multicast(daddr) &&
1739  IN_DEV_MFORWARD(in_dev))
1740 #endif
1741  ) {
1742  int res = ip_route_input_mc(skb, daddr, saddr,
1743  tos, dev, our);
1744  rcu_read_unlock();
1745  return res;
1746  }
1747  }
1748  rcu_read_unlock();
1749  return -EINVAL;
1750  }
1751  res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1752  rcu_read_unlock();
1753  return res;
1754 }
1755 EXPORT_SYMBOL(ip_route_input_noref);
1756 
1757 /* called with rcu_read_lock() */
1758 static struct rtable *__mkroute_output(const struct fib_result *res,
1759  const struct flowi4 *fl4, int orig_oif,
1760  struct net_device *dev_out,
1761  unsigned int flags)
1762 {
1763  struct fib_info *fi = res->fi;
1764  struct fib_nh_exception *fnhe;
1765  struct in_device *in_dev;
1766  u16 type = res->type;
1767  struct rtable *rth;
1768  bool do_cache;
1769 
1770  in_dev = __in_dev_get_rcu(dev_out);
1771  if (!in_dev)
1772  return ERR_PTR(-EINVAL);
1773 
1774  if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1775  if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1776  return ERR_PTR(-EINVAL);
1777 
1778  if (ipv4_is_lbcast(fl4->daddr))
1779  type = RTN_BROADCAST;
1780  else if (ipv4_is_multicast(fl4->daddr))
1781  type = RTN_MULTICAST;
1782  else if (ipv4_is_zeronet(fl4->daddr))
1783  return ERR_PTR(-EINVAL);
1784 
1785  if (dev_out->flags & IFF_LOOPBACK)
1786  flags |= RTCF_LOCAL;
1787 
1788  do_cache = true;
1789  if (type == RTN_BROADCAST) {
1790  flags |= RTCF_BROADCAST | RTCF_LOCAL;
1791  fi = NULL;
1792  } else if (type == RTN_MULTICAST) {
1793  flags |= RTCF_MULTICAST | RTCF_LOCAL;
1794  if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1795  fl4->flowi4_proto))
1796  flags &= ~RTCF_LOCAL;
1797  else
1798  do_cache = false;
1799  /* If a multicast route does not exist, use
1800  * the default one, but do not gateway in this case.
1801  * Yes, it is a hack.
1802  */
1803  if (fi && res->prefixlen < 4)
1804  fi = NULL;
1805  }
1806 
1807  fnhe = NULL;
1808  do_cache &= fi != NULL;
1809  if (do_cache) {
1810  struct rtable __rcu **prth;
1811  struct fib_nh *nh = &FIB_RES_NH(*res);
1812 
1813  fnhe = find_exception(nh, fl4->daddr);
1814  if (fnhe)
1815  prth = &fnhe->fnhe_rth;
1816  else {
1817  if (unlikely(fl4->flowi4_flags &
1818  FLOWI_FLAG_KNOWN_NH &&
1819  !(nh->nh_gw &&
1820  nh->nh_scope == RT_SCOPE_LINK))) {
1821  do_cache = false;
1822  goto add;
1823  }
1824  prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1825  }
1826  rth = rcu_dereference(*prth);
1827  if (rt_cache_valid(rth)) {
1828  dst_hold(&rth->dst);
1829  return rth;
1830  }
1831  }
1832 
1833 add:
1834  rth = rt_dst_alloc(dev_out,
1835  IN_DEV_CONF_GET(in_dev, NOPOLICY),
1836  IN_DEV_CONF_GET(in_dev, NOXFRM),
1837  do_cache);
1838  if (!rth)
1839  return ERR_PTR(-ENOBUFS);
1840 
1841  rth->dst.output = ip_output;
1842 
1843  rth->rt_genid = rt_genid(dev_net(dev_out));
1844  rth->rt_flags = flags;
1845  rth->rt_type = type;
1846  rth->rt_is_input = 0;
1847  rth->rt_iif = orig_oif ? : 0;
1848  rth->rt_pmtu = 0;
1849  rth->rt_gateway = 0;
1850  rth->rt_uses_gateway = 0;
1851  INIT_LIST_HEAD(&rth->rt_uncached);
1852 
1853  RT_CACHE_STAT_INC(out_slow_tot);
1854 
1855  if (flags & RTCF_LOCAL)
1856  rth->dst.input = ip_local_deliver;
1857  if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1858  if (flags & RTCF_LOCAL &&
1859  !(dev_out->flags & IFF_LOOPBACK)) {
1860  rth->dst.output = ip_mc_output;
1861  RT_CACHE_STAT_INC(out_slow_mc);
1862  }
1863 #ifdef CONFIG_IP_MROUTE
1864  if (type == RTN_MULTICAST) {
1865  if (IN_DEV_MFORWARD(in_dev) &&
1866  !ipv4_is_local_multicast(fl4->daddr)) {
1867  rth->dst.input = ip_mr_input;
1868  rth->dst.output = ip_mc_output;
1869  }
1870  }
1871 #endif
1872  }
1873 
1874  rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1875 
1876  return rth;
1877 }
1878 
1879 /*
1880  * Major route resolver routine.
1881  */
1882 
1883 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1884 {
1885  struct net_device *dev_out = NULL;
1886  __u8 tos = RT_FL_TOS(fl4);
1887  unsigned int flags = 0;
1888  struct fib_result res;
1889  struct rtable *rth;
1890  int orig_oif;
1891 
1892  res.tclassid = 0;
1893  res.fi = NULL;
1894  res.table = NULL;
1895 
1896  orig_oif = fl4->flowi4_oif;
1897 
1898  fl4->flowi4_iif = LOOPBACK_IFINDEX;
1899  fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1900  fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1901  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1902 
1903  rcu_read_lock();
1904  if (fl4->saddr) {
1905  rth = ERR_PTR(-EINVAL);
1906  if (ipv4_is_multicast(fl4->saddr) ||
1907  ipv4_is_lbcast(fl4->saddr) ||
1908  ipv4_is_zeronet(fl4->saddr))
1909  goto out;
1910 
1911  /* I removed check for oif == dev_out->oif here.
1912  It was wrong for two reasons:
1913  1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1914  is assigned to multiple interfaces.
1915  2. Moreover, we are allowed to send packets with saddr
1916  of another iface. --ANK
1917  */
1918 
1919  if (fl4->flowi4_oif == 0 &&
1920  (ipv4_is_multicast(fl4->daddr) ||
1921  ipv4_is_lbcast(fl4->daddr))) {
1922  /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1923  dev_out = __ip_dev_find(net, fl4->saddr, false);
1924  if (dev_out == NULL)
1925  goto out;
1926 
1927  /* Special hack: user can direct multicasts
1928  and limited broadcast via necessary interface
1929  without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1930  This hack is not just for fun, it allows
1931  vic,vat and friends to work.
1932  They bind socket to loopback, set ttl to zero
1933  and expect that it will work.
1934  From the viewpoint of routing cache they are broken,
1935  because we are not allowed to build multicast path
1936  with loopback source addr (look, routing cache
1937  cannot know, that ttl is zero, so that packet
1938  will not leave this host and route is valid).
1939  Luckily, this hack is a good workaround.
1940  */
1941 
1942  fl4->flowi4_oif = dev_out->ifindex;
1943  goto make_route;
1944  }
1945 
1946  if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1947  /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1948  if (!__ip_dev_find(net, fl4->saddr, false))
1949  goto out;
1950  }
1951  }
1952 
1953 
1954  if (fl4->flowi4_oif) {
1955  dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1956  rth = ERR_PTR(-ENODEV);
1957  if (dev_out == NULL)
1958  goto out;
1959 
1960  /* RACE: Check return value of inet_select_addr instead. */
1961  if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1962  rth = ERR_PTR(-ENETUNREACH);
1963  goto out;
1964  }
1965  if (ipv4_is_local_multicast(fl4->daddr) ||
1966  ipv4_is_lbcast(fl4->daddr)) {
1967  if (!fl4->saddr)
1968  fl4->saddr = inet_select_addr(dev_out, 0,
1969  RT_SCOPE_LINK);
1970  goto make_route;
1971  }
1972  if (fl4->saddr) {
1973  if (ipv4_is_multicast(fl4->daddr))
1974  fl4->saddr = inet_select_addr(dev_out, 0,
1975  fl4->flowi4_scope);
1976  else if (!fl4->daddr)
1977  fl4->saddr = inet_select_addr(dev_out, 0,
1978  RT_SCOPE_HOST);
1979  }
1980  }
1981 
1982  if (!fl4->daddr) {
1983  fl4->daddr = fl4->saddr;
1984  if (!fl4->daddr)
1985  fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1986  dev_out = net->loopback_dev;
1987  fl4->flowi4_oif = LOOPBACK_IFINDEX;
1988  res.type = RTN_LOCAL;
1989  flags |= RTCF_LOCAL;
1990  goto make_route;
1991  }
1992 
1993  if (fib_lookup(net, fl4, &res)) {
1994  res.fi = NULL;
1995  res.table = NULL;
1996  if (fl4->flowi4_oif) {
1997  /* Apparently, routing tables are wrong. Assume,
1998  that the destination is on link.
1999 
2000  WHY? DW.
2001  Because we are allowed to send to iface
2002  even if it has NO routes and NO assigned
2003  addresses. When oif is specified, routing
2004  tables are looked up with only one purpose:
2005  to catch if destination is gatewayed, rather than
2006  direct. Moreover, if MSG_DONTROUTE is set,
2007  we send packet, ignoring both routing tables
2008  and ifaddr state. --ANK
2009 
2010 
2011  We could make it even if oif is unknown,
2012  likely IPv6, but we do not.
2013  */
2014 
2015  if (fl4->saddr == 0)
2016  fl4->saddr = inet_select_addr(dev_out, 0,
2017  RT_SCOPE_LINK);
2018  res.type = RTN_UNICAST;
2019  goto make_route;
2020  }
2021  rth = ERR_PTR(-ENETUNREACH);
2022  goto out;
2023  }
2024 
2025  if (res.type == RTN_LOCAL) {
2026  if (!fl4->saddr) {
2027  if (res.fi->fib_prefsrc)
2028  fl4->saddr = res.fi->fib_prefsrc;
2029  else
2030  fl4->saddr = fl4->daddr;
2031  }
2032  dev_out = net->loopback_dev;
2033  fl4->flowi4_oif = dev_out->ifindex;
2034  flags |= RTCF_LOCAL;
2035  goto make_route;
2036  }
2037 
2038 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2039  if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2040  fib_select_multipath(&res);
2041  else
2042 #endif
2043  if (!res.prefixlen &&
2044  res.table->tb_num_default > 1 &&
2045  res.type == RTN_UNICAST && !fl4->flowi4_oif)
2046  fib_select_default(&res);
2047 
2048  if (!fl4->saddr)
2049  fl4->saddr = FIB_RES_PREFSRC(net, res);
2050 
2051  dev_out = FIB_RES_DEV(res);
2052  fl4->flowi4_oif = dev_out->ifindex;
2053 
2054 
2055 make_route:
2056  rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2057 
2058 out:
2059  rcu_read_unlock();
2060  return rth;
2061 }
2062 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2063 
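/* The no-op dst_ops below back ipv4_blackhole_route(): lookups on such a
 * dst always fail revalidation, and PMTU/redirect events are ignored, so
 * packets sent over it are quietly discarded. xfrm uses this (via the
 * blackhole_route hook; see xfrm4_policy.c) for non-blocking sockets
 * while an SA is still being negotiated.
 */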
2064 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066  return NULL;
2067 }
2068 
2069 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2070 {
2071  unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2072 
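 /* GNU "?:" extension: fall back to the device MTU when no RTAX_MTU
  * metric has been set on this dst. */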
2073  return mtu ? : dst->dev->mtu;
2074 }
2075 
2076 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2077  struct sk_buff *skb, u32 mtu)
2078 {
2079 }
2080 
2081 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2082  struct sk_buff *skb)
2083 {
2084 }
2085 
2086 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2087  unsigned long old)
2088 {
2089  return NULL;
2090 }
2091 
2092 static struct dst_ops ipv4_dst_blackhole_ops = {
2093  .family = AF_INET,
2094  .protocol = cpu_to_be16(ETH_P_IP),
2095  .check = ipv4_blackhole_dst_check,
2096  .mtu = ipv4_blackhole_mtu,
2097  .default_advmss = ipv4_default_advmss,
2098  .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2099  .redirect = ipv4_rt_blackhole_redirect,
2100  .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2101  .neigh_lookup = ipv4_neigh_lookup,
2102 };
2103 
2104 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2105 {
2106  struct rtable *ort = (struct rtable *) dst_orig;
2107  struct rtable *rt;
2108 
2109  rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2110  if (rt) {
2111  struct dst_entry *new = &rt->dst;
2112 
2113  new->__use = 1;
2114  new->input = dst_discard;
2115  new->output = dst_discard;
2116 
2117  new->dev = ort->dst.dev;
2118  if (new->dev)
2119  dev_hold(new->dev);
2120 
2121  rt->rt_is_input = ort->rt_is_input;
2122  rt->rt_iif = ort->rt_iif;
2123  rt->rt_pmtu = ort->rt_pmtu;
2124 
2125  rt->rt_genid = rt_genid(net);
2126  rt->rt_flags = ort->rt_flags;
2127  rt->rt_type = ort->rt_type;
2128  rt->rt_gateway = ort->rt_gateway;
2129  rt->rt_uses_gateway = ort->rt_uses_gateway;
2130 
2131  INIT_LIST_HEAD(&rt->rt_uncached);
2132 
2133  dst_free(new);
2134  }
2135 
2136  dst_release(dst_orig);
2137 
2138  return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2139 }
2140 
2141 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2142  struct sock *sk)
2143 {
2144  struct rtable *rt = __ip_route_output_key(net, flp4);
2145 
2146  if (IS_ERR(rt))
2147  return rt;
2148 
2149  if (flp4->flowi4_proto)
2150  rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2151  flowi4_to_flowi(flp4),
2152  sk, 0);
2153 
2154  return rt;
2155 }
2156 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2157 
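/* Illustrative kernel-side sketch (not from this file) of how a typical
 * caller resolves an output route with this API; cf. udp_sendmsg(). The
 * oif, addresses and ports are assumptions:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, RT_TOS(tos),
 *			   RT_SCOPE_UNIVERSE, IPPROTO_UDP,
 *			   inet_sk_flowi_flags(sk),
 *			   daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// ...transmit using rt->dst, then drop the reference:
 *	ip_rt_put(rt);
 */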
2158 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2159  struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2160  u32 seq, int event, int nowait, unsigned int flags)
2161 {
2162  struct rtable *rt = skb_rtable(skb);
2163  struct rtmsg *r;
2164  struct nlmsghdr *nlh;
2165  unsigned long expires = 0;
2166  u32 error;
2167  u32 metrics[RTAX_MAX];
2168 
2169  nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2170  if (nlh == NULL)
2171  return -EMSGSIZE;
2172 
2173  r = nlmsg_data(nlh);
2174  r->rtm_family = AF_INET;
2175  r->rtm_dst_len = 32;
2176  r->rtm_src_len = 0;
2177  r->rtm_tos = fl4->flowi4_tos;
2178  r->rtm_table = RT_TABLE_MAIN;
2179  if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2180  goto nla_put_failure;
2181  r->rtm_type = rt->rt_type;
2184  r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2185  if (rt->rt_flags & RTCF_NOTIFY)
2186  r->rtm_flags |= RTM_F_NOTIFY;
2187 
2188  if (nla_put_be32(skb, RTA_DST, dst))
2189  goto nla_put_failure;
2190  if (src) {
2191  r->rtm_src_len = 32;
2192  if (nla_put_be32(skb, RTA_SRC, src))
2193  goto nla_put_failure;
2194  }
2195  if (rt->dst.dev &&
2196  nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2197  goto nla_put_failure;
2198 #ifdef CONFIG_IP_ROUTE_CLASSID
2199  if (rt->dst.tclassid &&
2200  nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2201  goto nla_put_failure;
2202 #endif
2203  if (!rt_is_input_route(rt) &&
2204  fl4->saddr != src) {
2205  if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2206  goto nla_put_failure;
2207  }
2208  if (rt->rt_uses_gateway &&
2209  nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2210  goto nla_put_failure;
2211 
2212  expires = rt->dst.expires;
2213  if (expires) {
2214  unsigned long now = jiffies;
2215 
2216  if (time_before(now, expires))
2217  expires -= now;
2218  else
2219  expires = 0;
2220  }
2221 
2222  memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2223  if (rt->rt_pmtu && expires)
2224  metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2225  if (rtnetlink_put_metrics(skb, metrics) < 0)
2226  goto nla_put_failure;
2227 
2228  if (fl4->flowi4_mark &&
2229  nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2230  goto nla_put_failure;
2231 
2232  error = rt->dst.error;
2233 
2234  if (rt_is_input_route(rt)) {
2235  if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2236  goto nla_put_failure;
2237  }
2238 
2239  if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2240  goto nla_put_failure;
2241 
2242  return nlmsg_end(skb, nlh);
2243 
2244 nla_put_failure:
2245  nlmsg_cancel(skb, nlh);
2246  return -EMSGSIZE;
2247 }
2248 
2249 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2250 {
2251  struct net *net = sock_net(in_skb->sk);
2252  struct rtmsg *rtm;
2253  struct nlattr *tb[RTA_MAX+1];
2254  struct rtable *rt = NULL;
2255  struct flowi4 fl4;
2256  __be32 dst = 0;
2257  __be32 src = 0;
2258  u32 iif;
2259  int err;
2260  int mark;
2261  struct sk_buff *skb;
2262 
2263  err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2264  if (err < 0)
2265  goto errout;
2266 
2267  rtm = nlmsg_data(nlh);
2268 
2269  skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2270  if (skb == NULL) {
2271  err = -ENOBUFS;
2272  goto errout;
2273  }
2274 
2275  /* Reserve room for dummy headers; this skb can pass
2276  through a good chunk of the routing engine.
2277  */
2278  skb_reset_mac_header(skb);
2279  skb_reset_network_header(skb);
2280 
2281  /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2282  ip_hdr(skb)->protocol = IPPROTO_ICMP;
2283  skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2284 
2285  src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2286  dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2287  iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2288  mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2289 
2290  memset(&fl4, 0, sizeof(fl4));
2291  fl4.daddr = dst;
2292  fl4.saddr = src;
2293  fl4.flowi4_tos = rtm->rtm_tos;
2294  fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2295  fl4.flowi4_mark = mark;
2296 
2297  if (iif) {
2298  struct net_device *dev;
2299 
2300  dev = __dev_get_by_index(net, iif);
2301  if (dev == NULL) {
2302  err = -ENODEV;
2303  goto errout_free;
2304  }
2305 
2306  skb->protocol = htons(ETH_P_IP);
2307  skb->dev = dev;
2308  skb->mark = mark;
2309  local_bh_disable();
2310  err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2311  local_bh_enable();
2312 
2313  rt = skb_rtable(skb);
2314  if (err == 0 && rt->dst.error)
2315  err = -rt->dst.error;
2316  } else {
2317  rt = ip_route_output_key(net, &fl4);
2318 
2319  err = 0;
2320  if (IS_ERR(rt))
2321  err = PTR_ERR(rt);
2322  }
2323 
2324  if (err)
2325  goto errout_free;
2326 
2327  skb_dst_set(skb, &rt->dst);
2328  if (rtm->rtm_flags & RTM_F_NOTIFY)
2329  rt->rt_flags |= RTCF_NOTIFY;
2330 
2331  err = rt_fill_info(net, dst, src, &fl4, skb,
2332  NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2333  RTM_NEWROUTE, 0, 0);
2334  if (err <= 0)
2335  goto errout_free;
2336 
2337  err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2338 errout:
2339  return err;
2340 
2341 errout_free:
2342  kfree_skb(skb);
2343  goto errout;
2344 }
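/* Illustrative userspace sketch (not part of this file): "ip route get"
 * reaches the handler above by sending an RTM_GETROUTE request like the
 * following; error handling is elided and the destination is an
 * assumption:
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		char attrbuf[64];
 *	} req = {
 *		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.nlh.nlmsg_type = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family = AF_INET,
 *	};
 *	struct rtattr *rta = (struct rtattr *)((char *)&req +
 *					NLMSG_ALIGN(req.nlh.nlmsg_len));
 *	__be32 dst = inet_addr("192.0.2.1");
 *
 *	rta->rta_type = RTA_DST;
 *	rta->rta_len = RTA_LENGTH(sizeof(dst));
 *	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
 *	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;
 *
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *	send(fd, &req, req.nlh.nlmsg_len, 0);
 *	// The reply is the rt_fill_info() message built above.
 */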
2345 
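/* The IPv4 route cache was removed in 3.6, so a cache dump request has
 * nothing to report; returning skb->len signals an empty, finished dump. */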
2346 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2347 {
2348  return skb->len;
2349 }
2350 
2351 void ip_rt_multicast_event(struct in_device *in_dev)
2352 {
2353  rt_cache_flush(dev_net(in_dev->dev));
2354 }
2355 
2356 #ifdef CONFIG_SYSCTL
2357 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2358  void __user *buffer,
2359  size_t *lenp, loff_t *ppos)
2360 {
2361  if (write) {
2362  rt_cache_flush((struct net *)__ctl->extra1);
2363  return 0;
2364  }
2365 
2366  return -EINVAL;
2367 }
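/* Usage sketch: the handler above is write-only; writing any value flushes
 * the cache, while reads return -EINVAL:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */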
2368 
2369 static ctl_table ipv4_route_table[] = {
2370  {
2371  .procname = "gc_thresh",
2372  .data = &ipv4_dst_ops.gc_thresh,
2373  .maxlen = sizeof(int),
2374  .mode = 0644,
2375  .proc_handler = proc_dointvec,
2376  },
2377  {
2378  .procname = "max_size",
2379  .data = &ip_rt_max_size,
2380  .maxlen = sizeof(int),
2381  .mode = 0644,
2382  .proc_handler = proc_dointvec,
2383  },
2384  {
2385  /* Deprecated. Use gc_min_interval_ms */
2386 
2387  .procname = "gc_min_interval",
2388  .data = &ip_rt_gc_min_interval,
2389  .maxlen = sizeof(int),
2390  .mode = 0644,
2391  .proc_handler = proc_dointvec_jiffies,
2392  },
2393  {
2394  .procname = "gc_min_interval_ms",
2395  .data = &ip_rt_gc_min_interval,
2396  .maxlen = sizeof(int),
2397  .mode = 0644,
2398  .proc_handler = proc_dointvec_ms_jiffies,
2399  },
2400  {
2401  .procname = "gc_timeout",
2402  .data = &ip_rt_gc_timeout,
2403  .maxlen = sizeof(int),
2404  .mode = 0644,
2405  .proc_handler = proc_dointvec_jiffies,
2406  },
2407  {
2408  .procname = "gc_interval",
2409  .data = &ip_rt_gc_interval,
2410  .maxlen = sizeof(int),
2411  .mode = 0644,
2412  .proc_handler = proc_dointvec_jiffies,
2413  },
2414  {
2415  .procname = "redirect_load",
2416  .data = &ip_rt_redirect_load,
2417  .maxlen = sizeof(int),
2418  .mode = 0644,
2419  .proc_handler = proc_dointvec,
2420  },
2421  {
2422  .procname = "redirect_number",
2423  .data = &ip_rt_redirect_number,
2424  .maxlen = sizeof(int),
2425  .mode = 0644,
2426  .proc_handler = proc_dointvec,
2427  },
2428  {
2429  .procname = "redirect_silence",
2430  .data = &ip_rt_redirect_silence,
2431  .maxlen = sizeof(int),
2432  .mode = 0644,
2433  .proc_handler = proc_dointvec,
2434  },
2435  {
2436  .procname = "error_cost",
2437  .data = &ip_rt_error_cost,
2438  .maxlen = sizeof(int),
2439  .mode = 0644,
2440  .proc_handler = proc_dointvec,
2441  },
2442  {
2443  .procname = "error_burst",
2444  .data = &ip_rt_error_burst,
2445  .maxlen = sizeof(int),
2446  .mode = 0644,
2447  .proc_handler = proc_dointvec,
2448  },
2449  {
2450  .procname = "gc_elasticity",
2451  .data = &ip_rt_gc_elasticity,
2452  .maxlen = sizeof(int),
2453  .mode = 0644,
2454  .proc_handler = proc_dointvec,
2455  },
2456  {
2457  .procname = "mtu_expires",
2458  .data = &ip_rt_mtu_expires,
2459  .maxlen = sizeof(int),
2460  .mode = 0644,
2461  .proc_handler = proc_dointvec_jiffies,
2462  },
2463  {
2464  .procname = "min_pmtu",
2465  .data = &ip_rt_min_pmtu,
2466  .maxlen = sizeof(int),
2467  .mode = 0644,
2468  .proc_handler = proc_dointvec,
2469  },
2470  {
2471  .procname = "min_adv_mss",
2472  .data = &ip_rt_min_advmss,
2473  .maxlen = sizeof(int),
2474  .mode = 0644,
2475  .proc_handler = proc_dointvec,
2476  },
2477  { }
2478 };
2479 
2480 static struct ctl_table ipv4_route_flush_table[] = {
2481  {
2482  .procname = "flush",
2483  .maxlen = sizeof(int),
2484  .mode = 0200,
2485  .proc_handler = ipv4_sysctl_rtcache_flush,
2486  },
2487  { },
2488 };
2489 
2490 static __net_init int sysctl_route_net_init(struct net *net)
2491 {
2492  struct ctl_table *tbl;
2493 
2494  tbl = ipv4_route_flush_table;
2495  if (!net_eq(net, &init_net)) {
2496  tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2497  if (tbl == NULL)
2498  goto err_dup;
2499  }
2500  tbl[0].extra1 = net;
2501 
2502  net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2503  if (net->ipv4.route_hdr == NULL)
2504  goto err_reg;
2505  return 0;
2506 
2507 err_reg:
2508  if (tbl != ipv4_route_flush_table)
2509  kfree(tbl);
2510 err_dup:
2511  return -ENOMEM;
2512 }
2513 
2514 static __net_exit void sysctl_route_net_exit(struct net *net)
2515 {
2516  struct ctl_table *tbl;
2517 
2518  tbl = net->ipv4.route_hdr->ctl_table_arg;
2519  unregister_net_sysctl_table(net->ipv4.route_hdr);
2520  BUG_ON(tbl == ipv4_route_flush_table);
2521  kfree(tbl);
2522 }
2523 
2524 static __net_initdata struct pernet_operations sysctl_route_ops = {
2525  .init = sysctl_route_net_init,
2526  .exit = sysctl_route_net_exit,
2527 };
2528 #endif
2529 
2530 static __net_init int rt_genid_init(struct net *net)
2531 {
2532  atomic_set(&net->rt_genid, 0);
2533  get_random_bytes(&net->ipv4.dev_addr_genid,
2534  sizeof(net->ipv4.dev_addr_genid));
2535  return 0;
2536 }
2537 
2538 static __net_initdata struct pernet_operations rt_genid_ops = {
2539  .init = rt_genid_init,
2540 };
2541 
2542 static int __net_init ipv4_inetpeer_init(struct net *net)
2543 {
2544  struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2545 
2546  if (!bp)
2547  return -ENOMEM;
2548  inet_peer_base_init(bp);
2549  net->ipv4.peers = bp;
2550  return 0;
2551 }
2552 
2553 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2554 {
2555  struct inet_peer_base *bp = net->ipv4.peers;
2556 
2557  net->ipv4.peers = NULL;
2558  inetpeer_invalidate_tree(bp);
2559  kfree(bp);
2560 }
2561 
2562 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2563  .init = ipv4_inetpeer_init,
2564  .exit = ipv4_inetpeer_exit,
2565 };
2566 
2567 #ifdef CONFIG_IP_ROUTE_CLASSID
2568 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2569 #endif /* CONFIG_IP_ROUTE_CLASSID */
2570 
2571 int __init ip_rt_init(void)
2572 {
2573  int rc = 0;
2574 
2575 #ifdef CONFIG_IP_ROUTE_CLASSID
2576  ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2577  if (!ip_rt_acct)
2578  panic("IP: failed to allocate ip_rt_acct\n");
2579 #endif
2580 
2581  ipv4_dst_ops.kmem_cachep =
2582  kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2583  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2584 
2585  ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2586 
2587  if (dst_entries_init(&ipv4_dst_ops) < 0)
2588  panic("IP: failed to allocate ipv4_dst_ops counter\n");
2589 
2590  if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2591  panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2592 
2593  ipv4_dst_ops.gc_thresh = ~0;
2594  ip_rt_max_size = INT_MAX;
2595 
2596  devinet_init();
2597  ip_fib_init();
2598 
2599  if (ip_rt_proc_init())
2600  pr_err("Unable to create route proc files\n");
2601 #ifdef CONFIG_XFRM
2602  xfrm_init();
2603  xfrm4_init();
2604 #endif
2605  rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2606 
2607 #ifdef CONFIG_SYSCTL
2608  register_pernet_subsys(&sysctl_route_ops);
2609 #endif
2610  register_pernet_subsys(&rt_genid_ops);
2611  register_pernet_subsys(&ipv4_inetpeer_ops);
2612  return rc;
2613 }
2614 
2615 #ifdef CONFIG_SYSCTL
2616 /*
2617  * We really need to sanitize the damn ipv4 init order, then all
2618  * this nonsense will go away.
2619  */
2620 void __init ip_static_sysctl_init(void)
2621 {
2622  register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2623 }
2624 #endif