Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
ip_vs_core.c
Go to the documentation of this file.
1 /*
2  * IPVS An implementation of the IP virtual server support for the
3  * LINUX operating system. IPVS is now implemented as a module
4  * over the Netfilter framework. IPVS can be used to build a
5  * high-performance and highly available server based on a
6  * cluster of servers.
7  *
8  * Authors: Wensong Zhang <[email protected]>
9  * Peter Kese <[email protected]>
10  * Julian Anastasov <[email protected]>
11  *
12  * This program is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU General Public License
14  * as published by the Free Software Foundation; either version
15  * 2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  * Paul `Rusty' Russell properly handle non-linear skbs
23  * Harald Welte don't use nfcache
24  *
25  */
26 
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29 
30 #include <linux/module.h>
31 #include <linux/kernel.h>
32 #include <linux/ip.h>
33 #include <linux/tcp.h>
34 #include <linux/sctp.h>
35 #include <linux/icmp.h>
36 #include <linux/slab.h>
37 
38 #include <net/ip.h>
39 #include <net/tcp.h>
40 #include <net/udp.h>
41 #include <net/icmp.h> /* for icmp_send */
42 #include <net/route.h>
43 #include <net/ip6_checksum.h>
44 #include <net/netns/generic.h> /* net_generic() */
45 
46 #include <linux/netfilter.h>
47 #include <linux/netfilter_ipv4.h>
48 
49 #ifdef CONFIG_IP_VS_IPV6
50 #include <net/ipv6.h>
51 #include <linux/netfilter_ipv6.h>
52 #include <net/ip6_route.h>
53 #endif
54 
55 #include <net/ip_vs.h>
56 
57 
64 #ifdef CONFIG_IP_VS_PROTO_TCP
66 #endif
68 #ifdef CONFIG_IP_VS_DEBUG
69 EXPORT_SYMBOL(ip_vs_get_debug_level);
70 #endif
71 
72 int ip_vs_net_id __read_mostly;
73 #ifdef IP_VS_GENERIC_NETNS
74 EXPORT_SYMBOL(ip_vs_net_id);
75 #endif
76 /* netns cnt used for uniqueness */
77 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
78 
/*
 * ID used in ICMP lookups: the echo identifier field of an ICMP or
 * ICMPv6 header.  The macro argument is parenthesized in both variants
 * so that an arbitrary pointer expression (e.g. "hdr + 1") expands
 * correctly.
 */
#define icmp_id(icmph)		(((icmph)->un).echo.id)
#define icmpv6_id(icmph)	(((icmph)->icmp6_dataun).u_echo.identifier)
82 
/*
 *	Map an IP protocol number to a printable name.
 *
 *	Known protocols return a string literal.  Any other value is
 *	formatted as "IP_<number>" into a function-local static buffer, so
 *	that result is overwritten by the next call that takes the default
 *	branch.
 */
const char *ip_vs_proto_name(unsigned int proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* proto is unsigned: print with %u and bound the write */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
107 
109 {
110  while (--rows >= 0)
111  INIT_LIST_HEAD(&table[rows]);
112 }
113 
114 static inline void
115 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116 {
117  struct ip_vs_dest *dest = cp->dest;
118  struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
119 
120  if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
121  struct ip_vs_cpu_stats *s;
122 
123  s = this_cpu_ptr(dest->stats.cpustats);
124  s->ustats.inpkts++;
125  u64_stats_update_begin(&s->syncp);
126  s->ustats.inbytes += skb->len;
127  u64_stats_update_end(&s->syncp);
128 
129  s = this_cpu_ptr(dest->svc->stats.cpustats);
130  s->ustats.inpkts++;
131  u64_stats_update_begin(&s->syncp);
132  s->ustats.inbytes += skb->len;
133  u64_stats_update_end(&s->syncp);
134 
135  s = this_cpu_ptr(ipvs->tot_stats.cpustats);
136  s->ustats.inpkts++;
137  u64_stats_update_begin(&s->syncp);
138  s->ustats.inbytes += skb->len;
139  u64_stats_update_end(&s->syncp);
140  }
141 }
142 
143 
144 static inline void
145 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
146 {
147  struct ip_vs_dest *dest = cp->dest;
148  struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
149 
150  if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
151  struct ip_vs_cpu_stats *s;
152 
153  s = this_cpu_ptr(dest->stats.cpustats);
154  s->ustats.outpkts++;
155  u64_stats_update_begin(&s->syncp);
156  s->ustats.outbytes += skb->len;
157  u64_stats_update_end(&s->syncp);
158 
159  s = this_cpu_ptr(dest->svc->stats.cpustats);
160  s->ustats.outpkts++;
161  u64_stats_update_begin(&s->syncp);
162  s->ustats.outbytes += skb->len;
163  u64_stats_update_end(&s->syncp);
164 
165  s = this_cpu_ptr(ipvs->tot_stats.cpustats);
166  s->ustats.outpkts++;
167  u64_stats_update_begin(&s->syncp);
168  s->ustats.outbytes += skb->len;
169  u64_stats_update_end(&s->syncp);
170  }
171 }
172 
173 
174 static inline void
175 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
176 {
177  struct netns_ipvs *ipvs = net_ipvs(svc->net);
178  struct ip_vs_cpu_stats *s;
179 
180  s = this_cpu_ptr(cp->dest->stats.cpustats);
181  s->ustats.conns++;
182 
183  s = this_cpu_ptr(svc->stats.cpustats);
184  s->ustats.conns++;
185 
186  s = this_cpu_ptr(ipvs->tot_stats.cpustats);
187  s->ustats.conns++;
188 }
189 
190 
191 static inline void
192 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
193  const struct sk_buff *skb,
194  struct ip_vs_proto_data *pd)
195 {
196  if (likely(pd->pp->state_transition))
197  pd->pp->state_transition(cp, direction, skb, pd);
198 }
199 
200 static inline int
201 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
202  struct sk_buff *skb, int protocol,
203  const union nf_inet_addr *caddr, __be16 cport,
204  const union nf_inet_addr *vaddr, __be16 vport,
205  struct ip_vs_conn_param *p)
206 {
207  ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
208  vport, p);
209  p->pe = svc->pe;
210  if (p->pe && p->pe->fill_param)
211  return p->pe->fill_param(p, skb);
212 
213  return 0;
214 }
215 
216 /*
217  * IPVS persistent scheduling function
218  * It creates a connection entry according to its template if exists,
219  * or selects a server and creates a connection entry plus a template.
220  * Locking: we are svc user (svc->refcnt), so we hold all dests too
221  * Protocols supported: TCP, UDP
222  */
223 static struct ip_vs_conn *
224 ip_vs_sched_persist(struct ip_vs_service *svc,
225  struct sk_buff *skb,
226  __be16 src_port, __be16 dst_port, int *ignored)
227 {
228  struct ip_vs_conn *cp = NULL;
229  struct ip_vs_iphdr iph;
230  struct ip_vs_dest *dest;
231  struct ip_vs_conn *ct;
232  __be16 dport = 0; /* destination port to forward */
233  unsigned int flags;
234  struct ip_vs_conn_param param;
235  const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
236  union nf_inet_addr snet; /* source network of the client,
237  after masking */
238 
239  ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
240 
241  /* Mask saddr with the netmask to adjust template granularity */
242 #ifdef CONFIG_IP_VS_IPV6
243  if (svc->af == AF_INET6)
244  ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
245  else
246 #endif
247  snet.ip = iph.saddr.ip & svc->netmask;
248 
249  IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
250  "mnet %s\n",
251  IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
252  IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
253  IP_VS_DBG_ADDR(svc->af, &snet));
254 
255  /*
256  * As far as we know, FTP is a very complicated network protocol, and
257  * it uses control connection and data connections. For active FTP,
258  * FTP server initialize data connection to the client, its source port
259  * is often 20. For passive FTP, FTP server tells the clients the port
260  * that it passively listens to, and the client issues the data
261  * connection. In the tunneling or direct routing mode, the load
262  * balancer is on the client-to-server half of connection, the port
263  * number is unknown to the load balancer. So, a conn template like
264  * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
265  * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
266  * is created for other persistent services.
267  */
268  {
269  int protocol = iph.protocol;
270  const union nf_inet_addr *vaddr = &iph.daddr;
271  __be16 vport = 0;
272 
273  if (dst_port == svc->port) {
274  /* non-FTP template:
275  * <protocol, caddr, 0, vaddr, vport, daddr, dport>
276  * FTP template:
277  * <protocol, caddr, 0, vaddr, 0, daddr, 0>
278  */
279  if (svc->port != FTPPORT)
280  vport = dst_port;
281  } else {
282  /* Note: persistent fwmark-based services and
283  * persistent port zero service are handled here.
284  * fwmark template:
285  * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
286  * port zero template:
287  * <protocol,caddr,0,vaddr,0,daddr,0>
288  */
289  if (svc->fwmark) {
290  protocol = IPPROTO_IP;
291  vaddr = &fwmark;
292  }
293  }
294  /* return *ignored = -1 so NF_DROP can be used */
295  if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
296  vaddr, vport, &param) < 0) {
297  *ignored = -1;
298  return NULL;
299  }
300  }
301 
302  /* Check if a template already exists */
303  ct = ip_vs_ct_in_get(&param);
304  if (!ct || !ip_vs_check_template(ct)) {
305  /*
306  * No template found or the dest of the connection
307  * template is not available.
308  * return *ignored=0 i.e. ICMP and NF_DROP
309  */
310  dest = svc->scheduler->schedule(svc, skb);
311  if (!dest) {
312  IP_VS_DBG(1, "p-schedule: no dest found.\n");
313  kfree(param.pe_data);
314  *ignored = 0;
315  return NULL;
316  }
317 
318  if (dst_port == svc->port && svc->port != FTPPORT)
319  dport = dest->port;
320 
321  /* Create a template
322  * This adds param.pe_data to the template,
323  * and thus param.pe_data will be destroyed
324  * when the template expires */
325  ct = ip_vs_conn_new(&param, &dest->addr, dport,
326  IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
327  if (ct == NULL) {
328  kfree(param.pe_data);
329  *ignored = -1;
330  return NULL;
331  }
332 
333  ct->timeout = svc->timeout;
334  } else {
335  /* set destination with the found template */
336  dest = ct->dest;
337  kfree(param.pe_data);
338  }
339 
340  dport = dst_port;
341  if (dport == svc->port && dest->port)
342  dport = dest->port;
343 
344  flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
345  && iph.protocol == IPPROTO_UDP)?
347 
348  /*
349  * Create a new connection according to the template
350  */
351  ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
352  src_port, &iph.daddr, dst_port, &param);
353 
354  cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
355  if (cp == NULL) {
356  ip_vs_conn_put(ct);
357  *ignored = -1;
358  return NULL;
359  }
360 
361  /*
362  * Add its control
363  */
364  ip_vs_control_add(cp, ct);
365  ip_vs_conn_put(ct);
366 
367  ip_vs_conn_stats(cp, svc);
368  return cp;
369 }
370 
371 
372 /*
373  * IPVS main scheduling function
374  * It selects a server according to the virtual service, and
375  * creates a connection entry.
376  * Protocols supported: TCP, UDP
377  *
378  * Usage of *ignored
379  *
380  * 1 : protocol tried to schedule (eg. on SYN), found svc but the
381  * svc/scheduler decides that this packet should be accepted with
382  * NF_ACCEPT because it must not be scheduled.
383  *
384  * 0 : scheduler can not find destination, so try bypass or
385  * return ICMP and then NF_DROP (ip_vs_leave).
386  *
387  * -1 : scheduler tried to schedule but fatal error occurred, eg.
388  * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
389  * failure such as missing Call-ID, ENOMEM on skb_linearize
390  * or pe_data. In this case we should return NF_DROP without
391  * any attempts to send ICMP with ip_vs_leave.
392  */
393 struct ip_vs_conn *
394 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
395  struct ip_vs_proto_data *pd, int *ignored)
396 {
397  struct ip_vs_protocol *pp = pd->pp;
398  struct ip_vs_conn *cp = NULL;
399  struct ip_vs_iphdr iph;
400  struct ip_vs_dest *dest;
401  __be16 _ports[2], *pptr;
402  unsigned int flags;
403 
404  *ignored = 1;
405  ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
406  pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
407  if (pptr == NULL)
408  return NULL;
409 
410  /*
411  * FTPDATA needs this check when using local real server.
412  * Never schedule Active FTPDATA connections from real server.
413  * For LVS-NAT they must be already created. For other methods
414  * with persistence the connection is created on SYN+ACK.
415  */
416  if (pptr[0] == FTPDATA) {
417  IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
418  "Not scheduling FTPDATA");
419  return NULL;
420  }
421 
422  /*
423  * Do not schedule replies from local real server.
424  */
425  if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
426  (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
427  IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
428  "Not scheduling reply for existing connection");
429  __ip_vs_conn_put(cp);
430  return NULL;
431  }
432 
433  /*
434  * Persistent service
435  */
436  if (svc->flags & IP_VS_SVC_F_PERSISTENT)
437  return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
438 
439  *ignored = 0;
440 
441  /*
442  * Non-persistent service
443  */
444  if (!svc->fwmark && pptr[1] != svc->port) {
445  if (!svc->port)
446  pr_err("Schedule: port zero only supported "
447  "in persistent services, "
448  "check your ipvs configuration\n");
449  return NULL;
450  }
451 
452  dest = svc->scheduler->schedule(svc, skb);
453  if (dest == NULL) {
454  IP_VS_DBG(1, "Schedule: no dest found.\n");
455  return NULL;
456  }
457 
458  flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
459  && iph.protocol == IPPROTO_UDP)?
461 
462  /*
463  * Create a connection entry.
464  */
465  {
466  struct ip_vs_conn_param p;
467 
468  ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
469  &iph.saddr, pptr[0], &iph.daddr, pptr[1],
470  &p);
471  cp = ip_vs_conn_new(&p, &dest->addr,
472  dest->port ? dest->port : pptr[1],
473  flags, dest, skb->mark);
474  if (!cp) {
475  *ignored = -1;
476  return NULL;
477  }
478  }
479 
480  IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
481  "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
482  ip_vs_fwd_tag(cp),
483  IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
484  IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
485  IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
486  cp->flags, atomic_read(&cp->refcnt));
487 
488  ip_vs_conn_stats(cp, svc);
489  return cp;
490 }
491 
492 
493 /*
494  * Pass or drop the packet.
495  * Called by ip_vs_in, when the virtual service is available but
496  * no destination is available for a new connection.
497  */
498 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
499  struct ip_vs_proto_data *pd)
500 {
501  __be16 _ports[2], *pptr;
502  struct ip_vs_iphdr iph;
503 #ifdef CONFIG_SYSCTL
504  struct net *net;
505  struct netns_ipvs *ipvs;
506  int unicast;
507 #endif
508 
509  ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
510 
511  pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
512  if (pptr == NULL) {
513  ip_vs_service_put(svc);
514  return NF_DROP;
515  }
516 
517 #ifdef CONFIG_SYSCTL
518  net = skb_net(skb);
519 
520 #ifdef CONFIG_IP_VS_IPV6
521  if (svc->af == AF_INET6)
522  unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
523  else
524 #endif
525  unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
526 
527  /* if it is fwmark-based service, the cache_bypass sysctl is up
528  and the destination is a non-local unicast, then create
529  a cache_bypass connection entry */
530  ipvs = net_ipvs(net);
531  if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
532  int ret;
533  struct ip_vs_conn *cp;
534  unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
535  iph.protocol == IPPROTO_UDP)?
537  union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
538 
539  ip_vs_service_put(svc);
540 
541  /* create a new connection entry */
542  IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
543  {
544  struct ip_vs_conn_param p;
545  ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
546  &iph.saddr, pptr[0],
547  &iph.daddr, pptr[1], &p);
548  cp = ip_vs_conn_new(&p, &daddr, 0,
549  IP_VS_CONN_F_BYPASS | flags,
550  NULL, skb->mark);
551  if (!cp)
552  return NF_DROP;
553  }
554 
555  /* statistics */
556  ip_vs_in_stats(cp, skb);
557 
558  /* set state */
559  ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
560 
561  /* transmit the first SYN packet */
562  ret = cp->packet_xmit(skb, cp, pd->pp);
563  /* do not touch skb anymore */
564 
565  atomic_inc(&cp->in_pkts);
566  ip_vs_conn_put(cp);
567  return ret;
568  }
569 #endif
570 
571  /*
572  * When the virtual ftp service is presented, packets destined
573  * for other services on the VIP may get here (except services
574  * listed in the ipvs table), pass the packets, because it is
575  * not ipvs job to decide to drop the packets.
576  */
577  if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
578  ip_vs_service_put(svc);
579  return NF_ACCEPT;
580  }
581 
582  ip_vs_service_put(svc);
583 
584  /*
585  * Notify the client that the destination is unreachable, and
586  * release the socket buffer.
587  * Since it is in IP layer, the TCP socket is not actually
588  * created, the TCP RST packet cannot be sent, instead that
589  * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
590  */
591 #ifdef CONFIG_IP_VS_IPV6
592  if (svc->af == AF_INET6) {
593  if (!skb->dev) {
594  struct net *net = dev_net(skb_dst(skb)->dev);
595 
596  skb->dev = net->loopback_dev;
597  }
599  } else
600 #endif
602 
603  return NF_DROP;
604 }
605 
/*
 * Accessors for per-netns IPVS sysctl values.  Without CONFIG_SYSCTL
 * every accessor returns 0.
 */
#ifdef CONFIG_SYSCTL

/* snat_reroute setting of the netns the skb belongs to */
static int sysctl_snat_reroute(struct sk_buff *skb)
{
	return net_ipvs(skb_net(skb))->sysctl_snat_reroute;
}

/* nat_icmp_send setting of the given netns */
static int sysctl_nat_icmp_send(struct net *net)
{
	return net_ipvs(net)->sysctl_nat_icmp_send;
}

/* expire_nodest_conn setting of the given IPVS instance */
static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_expire_nodest_conn;
}

#else

static int sysctl_snat_reroute(struct sk_buff *skb)
{
	return 0;
}

static int sysctl_nat_icmp_send(struct net *net)
{
	return 0;
}

static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
{
	return 0;
}

#endif
632 
634 {
635  return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
636 }
637 
638 static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
639 {
640  if (NF_INET_LOCAL_IN == hooknum)
641  return IP_DEFRAG_VS_IN;
642  if (NF_INET_FORWARD == hooknum)
643  return IP_DEFRAG_VS_FWD;
644  return IP_DEFRAG_VS_OUT;
645 }
646 
647 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
648 {
649  int err = ip_defrag(skb, user);
650 
651  if (!err)
652  ip_send_check(ip_hdr(skb));
653 
654  return err;
655 }
656 
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_gather_frags().  It is a placeholder:
 * both arguments are ignored and 0 is always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
	/* TODO IPv6: Find out what to do here for IPv6 */
	return 0;
}
#endif
664 
665 static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
666 {
667 #ifdef CONFIG_IP_VS_IPV6
668  if (af == AF_INET6) {
669  if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0)
670  return 1;
671  } else
672 #endif
673  if ((sysctl_snat_reroute(skb) ||
674  skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
675  ip_route_me_harder(skb, RTN_LOCAL) != 0)
676  return 1;
677 
678  return 0;
679 }
680 
681 /*
682  * Packet has been made sufficiently writable in caller
683  * - inout: 1=in->out, 0=out->in
684  */
685 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
686  struct ip_vs_conn *cp, int inout)
687 {
688  struct iphdr *iph = ip_hdr(skb);
689  unsigned int icmp_offset = iph->ihl*4;
690  struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
691  icmp_offset);
692  struct iphdr *ciph = (struct iphdr *)(icmph + 1);
693 
694  if (inout) {
695  iph->saddr = cp->vaddr.ip;
696  ip_send_check(iph);
697  ciph->daddr = cp->vaddr.ip;
698  ip_send_check(ciph);
699  } else {
700  iph->daddr = cp->daddr.ip;
701  ip_send_check(iph);
702  ciph->saddr = cp->daddr.ip;
703  ip_send_check(ciph);
704  }
705 
706  /* the TCP/UDP/SCTP port */
707  if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
708  IPPROTO_SCTP == ciph->protocol) {
709  __be16 *ports = (void *)ciph + ciph->ihl*4;
710 
711  if (inout)
712  ports[1] = cp->vport;
713  else
714  ports[0] = cp->dport;
715  }
716 
717  /* And finally the ICMP checksum */
718  icmph->checksum = 0;
719  icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
721 
722  if (inout)
723  IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
724  "Forwarding altered outgoing ICMP");
725  else
726  IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
727  "Forwarding altered incoming ICMP");
728 }
729 
#ifdef CONFIG_IP_VS_IPV6
/*
 * Rewrite an ICMPv6 error packet for NAT: the outer IPv6 header, the
 * IPv6 header embedded in the ICMPv6 payload and, for TCP/UDP/SCTP,
 * one port of the embedded transport header.
 * - inout: 1=in->out, 0=out->in
 *
 * The ICMPv6 checksum is not finished here: the field is seeded with
 * the complemented pseudo-header sum and csum_start/csum_offset are
 * set so it can be completed later (CHECKSUM_PARTIAL).
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *iph	 = ipv6_hdr(skb);
	unsigned int icmp_offset = sizeof(struct ipv6hdr);
	struct icmp6hdr *icmph	 = (struct icmp6hdr *)(skb_network_header(skb) +
						      icmp_offset);
	/* the embedded IPv6 header directly follows the ICMPv6 header */
	struct ipv6hdr *ciph	 = (struct ipv6hdr *)(icmph + 1);

	if (inout) {
		iph->saddr = cp->vaddr.in6;
		ciph->daddr = cp->vaddr.in6;
	} else {
		iph->daddr = cp->daddr.in6;
		ciph->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP/SCTP port */
	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
	    IPPROTO_SCTP == ciph->nexthdr) {
		/* ports[0] is the source port, ports[1] the destination */
		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);

		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/* And finally the ICMP checksum */
	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
					      skb->len - icmp_offset,
					      IPPROTO_ICMPV6, 0);
	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (inout)
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered incoming ICMPv6");
}
#endif
777 
778 /* Handle relevant response ICMP messages - forward to the right
779  * destination host.
780  */
781 static int handle_response_icmp(int af, struct sk_buff *skb,
782  union nf_inet_addr *snet,
783  __u8 protocol, struct ip_vs_conn *cp,
784  struct ip_vs_protocol *pp,
785  unsigned int offset, unsigned int ihl)
786 {
787  unsigned int verdict = NF_DROP;
788 
789  if (IP_VS_FWD_METHOD(cp) != 0) {
790  pr_err("shouldn't reach here, because the box is on the "
791  "half connection in the tun/dr module.\n");
792  }
793 
794  /* Ensure the checksum is correct */
795  if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
796  /* Failed checksum! */
797  IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
798  IP_VS_DBG_ADDR(af, snet));
799  goto out;
800  }
801 
802  if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
803  IPPROTO_SCTP == protocol)
804  offset += 2 * sizeof(__u16);
805  if (!skb_make_writable(skb, offset))
806  goto out;
807 
808 #ifdef CONFIG_IP_VS_IPV6
809  if (af == AF_INET6)
810  ip_vs_nat_icmp_v6(skb, pp, cp, 1);
811  else
812 #endif
813  ip_vs_nat_icmp(skb, pp, cp, 1);
814 
815  if (ip_vs_route_me_harder(af, skb))
816  goto out;
817 
818  /* do the statistics and put it back */
819  ip_vs_out_stats(cp, skb);
820 
821  skb->ipvs_property = 1;
822  if (!(cp->flags & IP_VS_CONN_F_NFCT))
823  ip_vs_notrack(skb);
824  else
825  ip_vs_update_conntrack(skb, cp, 0);
826  verdict = NF_ACCEPT;
827 
828 out:
829  __ip_vs_conn_put(cp);
830 
831  return verdict;
832 }
833 
834 /*
835  * Handle ICMP messages in the inside-to-outside direction (outgoing).
836  * Find any that might be relevant, check against existing connections.
837  * Currently handles error types - unreachable, quench, ttl exceeded.
838  */
839 static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
840  unsigned int hooknum)
841 {
842  struct iphdr *iph;
843  struct icmphdr _icmph, *ic;
844  struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
845  struct ip_vs_iphdr ciph;
846  struct ip_vs_conn *cp;
847  struct ip_vs_protocol *pp;
848  unsigned int offset, ihl;
849  union nf_inet_addr snet;
850 
851  *related = 1;
852 
853  /* reassemble IP fragments */
854  if (ip_is_fragment(ip_hdr(skb))) {
855  if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
856  return NF_STOLEN;
857  }
858 
859  iph = ip_hdr(skb);
860  offset = ihl = iph->ihl * 4;
861  ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
862  if (ic == NULL)
863  return NF_DROP;
864 
865  IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
866  ic->type, ntohs(icmp_id(ic)),
867  &iph->saddr, &iph->daddr);
868 
869  /*
870  * Work through seeing if this is for us.
871  * These checks are supposed to be in an order that means easy
872  * things are checked first to speed up processing.... however
873  * this means that some packets will manage to get a long way
874  * down this stack and then be rejected, but that's life.
875  */
876  if ((ic->type != ICMP_DEST_UNREACH) &&
877  (ic->type != ICMP_SOURCE_QUENCH) &&
878  (ic->type != ICMP_TIME_EXCEEDED)) {
879  *related = 0;
880  return NF_ACCEPT;
881  }
882 
883  /* Now find the contained IP header */
884  offset += sizeof(_icmph);
885  cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
886  if (cih == NULL)
887  return NF_ACCEPT; /* The packet looks wrong, ignore */
888 
889  pp = ip_vs_proto_get(cih->protocol);
890  if (!pp)
891  return NF_ACCEPT;
892 
893  /* Is the embedded protocol header present? */
894  if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
895  pp->dont_defrag))
896  return NF_ACCEPT;
897 
898  IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
899  "Checking outgoing ICMP for");
900 
901  offset += cih->ihl * 4;
902 
903  ip_vs_fill_iphdr(AF_INET, cih, &ciph);
904  /* The embedded headers contain source and dest in reverse order */
905  cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
906  if (!cp)
907  return NF_ACCEPT;
908 
909  snet.ip = iph->saddr;
910  return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
911  pp, offset, ihl);
912 }
913 
#ifdef CONFIG_IP_VS_IPV6
/*
 *	ICMPv6 counterpart of ip_vs_out_icmp(): look at outgoing ICMPv6
 *	errors (destination unreachable, packet too big, time exceeded)
 *	and forward those that match an existing connection.
 *
 *	*related is set to 0 only for ICMPv6 types that are not handled
 *	here; it is 1 on every other return.
 */
static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
			     unsigned int hooknum)
{
	struct ipv6hdr *iph;
	struct icmp6hdr _icmph, *ic;
	struct ipv6hdr _ciph, *cih;	/* The ip header contained
					   within the ICMP */
	struct ip_vs_iphdr ciph;
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	unsigned int offset;
	union nf_inet_addr snet;

	*related = 1;

	/* reassemble IP fragments */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT &&
	    ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
		return NF_STOLEN;

	iph = ipv6_hdr(skb);
	offset = sizeof(struct ipv6hdr);
	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
		  &iph->saddr, &iph->daddr);

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	switch (ic->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	offset += sizeof(_icmph);
	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
	if (cih == NULL)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	pp = ip_vs_proto_get(cih->nexthdr);
	if (pp == NULL)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
		      "Checking outgoing ICMPv6 for");

	/* step over the embedded IPv6 header to the transport header */
	offset += sizeof(struct ipv6hdr);

	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
	/* The embedded headers contain source and dest in reverse order */
	cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
	if (cp == NULL)
		return NF_ACCEPT;

	snet.in6 = iph->saddr;
	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
				    pp, offset, sizeof(struct ipv6hdr));
}
#endif
991 
992 /*
993  * Check if sctp chunc is ABORT chunk
994  */
995 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
996 {
997  sctp_chunkhdr_t *sch, schunk;
998  sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
999  sizeof(schunk), &schunk);
1000  if (sch == NULL)
1001  return 0;
1002  if (sch->type == SCTP_CID_ABORT)
1003  return 1;
1004  return 0;
1005 }
1006 
1007 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1008 {
1009  struct tcphdr _tcph, *th;
1010 
1011  th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1012  if (th == NULL)
1013  return 0;
1014  return th->rst;
1015 }
1016 
1017 /* Handle response packets: rewrite addresses and send away...
1018  */
1019 static unsigned int
1020 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
1021  struct ip_vs_conn *cp, int ihl)
1022 {
1023  struct ip_vs_protocol *pp = pd->pp;
1024 
1025  IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
1026 
1027  if (!skb_make_writable(skb, ihl))
1028  goto drop;
1029 
1030  /* mangle the packet */
1031  if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
1032  goto drop;
1033 
1034 #ifdef CONFIG_IP_VS_IPV6
1035  if (af == AF_INET6)
1036  ipv6_hdr(skb)->saddr = cp->vaddr.in6;
1037  else
1038 #endif
1039  {
1040  ip_hdr(skb)->saddr = cp->vaddr.ip;
1041  ip_send_check(ip_hdr(skb));
1042  }
1043 
1044  /*
1045  * nf_iterate does not expect change in the skb->dst->dev.
1046  * It looks like it is not fatal to enable this code for hooks
1047  * where our handlers are at the end of the chain list and
1048  * when all next handlers use skb->dst->dev and not outdev.
1049  * It will definitely route properly the inout NAT traffic
1050  * when multiple paths are used.
1051  */
1052 
1053  /* For policy routing, packets originating from this
1054  * machine itself may be routed differently to packets
1055  * passing through. We want this packet to be routed as
1056  * if it came from this machine itself. So re-compute
1057  * the routing information.
1058  */
1059  if (ip_vs_route_me_harder(af, skb))
1060  goto drop;
1061 
1062  IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
1063 
1064  ip_vs_out_stats(cp, skb);
1065  ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
1066  skb->ipvs_property = 1;
1067  if (!(cp->flags & IP_VS_CONN_F_NFCT))
1068  ip_vs_notrack(skb);
1069  else
1070  ip_vs_update_conntrack(skb, cp, 0);
1071  ip_vs_conn_put(cp);
1072 
1073  LeaveFunction(11);
1074  return NF_ACCEPT;
1075 
1076 drop:
1077  ip_vs_conn_put(cp);
1078  kfree_skb(skb);
1079  LeaveFunction(11);
1080  return NF_STOLEN;
1081 }
1082 
1083 /*
1084  * Check if outgoing packet belongs to the established ip_vs_conn.
1085  */
1086 static unsigned int
1087 ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1088 {
1089  struct net *net = NULL;
1090  struct ip_vs_iphdr iph;
1091  struct ip_vs_protocol *pp;
1092  struct ip_vs_proto_data *pd;
1093  struct ip_vs_conn *cp;
1094 
1095  EnterFunction(11);
1096 
1097  /* Already marked as IPVS request or reply? */
1098  if (skb->ipvs_property)
1099  return NF_ACCEPT;
1100 
1101  /* Bad... Do not break raw sockets */
1102  if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1103  af == AF_INET)) {
1104  struct sock *sk = skb->sk;
1105  struct inet_sock *inet = inet_sk(skb->sk);
1106 
1107  if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1108  return NF_ACCEPT;
1109  }
1110 
1111  if (unlikely(!skb_dst(skb)))
1112  return NF_ACCEPT;
1113 
1114  net = skb_net(skb);
1115  if (!net_ipvs(net)->enable)
1116  return NF_ACCEPT;
1117 
1118  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1119 #ifdef CONFIG_IP_VS_IPV6
1120  if (af == AF_INET6) {
1121  if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1122  int related;
1123  int verdict = ip_vs_out_icmp_v6(skb, &related,
1124  hooknum);
1125 
1126  if (related)
1127  return verdict;
1128  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1129  }
1130  } else
1131 #endif
1132  if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1133  int related;
1134  int verdict = ip_vs_out_icmp(skb, &related, hooknum);
1135 
1136  if (related)
1137  return verdict;
1138  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1139  }
1140 
1141  pd = ip_vs_proto_data_get(net, iph.protocol);
1142  if (unlikely(!pd))
1143  return NF_ACCEPT;
1144  pp = pd->pp;
1145 
1146  /* reassemble IP fragments */
1147 #ifdef CONFIG_IP_VS_IPV6
1148  if (af == AF_INET6) {
1149  if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1150  if (ip_vs_gather_frags_v6(skb,
1151  ip_vs_defrag_user(hooknum)))
1152  return NF_STOLEN;
1153  }
1154 
1155  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1156  } else
1157 #endif
1158  if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
1159  if (ip_vs_gather_frags(skb,
1160  ip_vs_defrag_user(hooknum)))
1161  return NF_STOLEN;
1162 
1163  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1164  }
1165 
1166  /*
1167  * Check if the packet belongs to an existing entry
1168  */
1169  cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
1170 
1171  if (likely(cp))
1172  return handle_response(af, skb, pd, cp, iph.len);
1173  if (sysctl_nat_icmp_send(net) &&
1174  (pp->protocol == IPPROTO_TCP ||
1175  pp->protocol == IPPROTO_UDP ||
1176  pp->protocol == IPPROTO_SCTP)) {
1177  __be16 _ports[2], *pptr;
1178 
1179  pptr = skb_header_pointer(skb, iph.len,
1180  sizeof(_ports), _ports);
1181  if (pptr == NULL)
1182  return NF_ACCEPT; /* Not for me */
1183  if (ip_vs_lookup_real_service(net, af, iph.protocol,
1184  &iph.saddr,
1185  pptr[0])) {
1186  /*
1187  * Notify the real server: there is no
1188  * existing entry if it is not RST
1189  * packet or not TCP packet.
1190  */
1191  if ((iph.protocol != IPPROTO_TCP &&
1192  iph.protocol != IPPROTO_SCTP)
1193  || ((iph.protocol == IPPROTO_TCP
1194  && !is_tcp_reset(skb, iph.len))
1195  || (iph.protocol == IPPROTO_SCTP
1196  && !is_sctp_abort(skb,
1197  iph.len)))) {
1198 #ifdef CONFIG_IP_VS_IPV6
1199  if (af == AF_INET6) {
1200  struct net *net =
1201  dev_net(skb_dst(skb)->dev);
1202 
1203  if (!skb->dev)
1204  skb->dev = net->loopback_dev;
1205  icmpv6_send(skb,
1208  0);
1209  } else
1210 #endif
1211  icmp_send(skb,
1213  ICMP_PORT_UNREACH, 0);
1214  return NF_DROP;
1215  }
1216  }
1217  }
1218  IP_VS_DBG_PKT(12, af, pp, skb, 0,
1219  "ip_vs_out: packet continues traversal as normal");
1220  return NF_ACCEPT;
1221 }
1222 
1223 /*
1224  * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1225  * used only for VS/NAT.
1226  * Check if packet is reply for established ip_vs_conn.
1227  */
1228 static unsigned int
1229 ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1230  const struct net_device *in, const struct net_device *out,
1231  int (*okfn)(struct sk_buff *))
1232 {
1233  return ip_vs_out(hooknum, skb, AF_INET);
1234 }
1235 
1236 /*
1237  * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1238  * Check if packet is reply for established ip_vs_conn.
1239  */
1240 static unsigned int
1241 ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1242  const struct net_device *in, const struct net_device *out,
1243  int (*okfn)(struct sk_buff *))
1244 {
1245  unsigned int verdict;
1246 
1247  /* Disable BH in LOCAL_OUT until all places are fixed */
1248  local_bh_disable();
1249  verdict = ip_vs_out(hooknum, skb, AF_INET);
1250  local_bh_enable();
1251  return verdict;
1252 }
1253 
1254 #ifdef CONFIG_IP_VS_IPV6
1255 
1256 /*
1257  * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1258  * used only for VS/NAT.
1259  * Check if packet is reply for established ip_vs_conn.
1260  */
1261 static unsigned int
1262 ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1263  const struct net_device *in, const struct net_device *out,
1264  int (*okfn)(struct sk_buff *))
1265 {
1266  return ip_vs_out(hooknum, skb, AF_INET6);
1267 }
1268 
1269 /*
1270  * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1271  * Check if packet is reply for established ip_vs_conn.
1272  */
1273 static unsigned int
1274 ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1275  const struct net_device *in, const struct net_device *out,
1276  int (*okfn)(struct sk_buff *))
1277 {
1278  unsigned int verdict;
1279 
1280  /* Disable BH in LOCAL_OUT until all places are fixed */
1281  local_bh_disable();
1282  verdict = ip_vs_out(hooknum, skb, AF_INET6);
1283  local_bh_enable();
1284  return verdict;
1285 }
1286 
1287 #endif
1288 
1289 /*
1290  * Handle ICMP messages in the outside-to-inside direction (incoming).
1291  * Find any that might be relevant, check against existing connections,
1292  * forward to the right destination host if relevant.
1293  * Currently handles error types - unreachable, quench, ttl exceeded.
1294  */
1295 static int
1296 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1297 {
1298  struct net *net = NULL;
1299  struct iphdr *iph;
1300  struct icmphdr _icmph, *ic;
1301  struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1302  struct ip_vs_iphdr ciph;
1303  struct ip_vs_conn *cp;
1304  struct ip_vs_protocol *pp;
1305  struct ip_vs_proto_data *pd;
1306  unsigned int offset, offset2, ihl, verdict;
1307  bool ipip;
1308 
1309  *related = 1;
1310 
1311  /* reassemble IP fragments */
1312  if (ip_is_fragment(ip_hdr(skb))) {
1313  if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1314  return NF_STOLEN;
1315  }
1316 
1317  iph = ip_hdr(skb);
1318  offset = ihl = iph->ihl * 4;
1319  ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1320  if (ic == NULL)
1321  return NF_DROP;
1322 
1323  IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1324  ic->type, ntohs(icmp_id(ic)),
1325  &iph->saddr, &iph->daddr);
1326 
1327  /*
1328  * Work through seeing if this is for us.
1329  * These checks are supposed to be in an order that means easy
1330  * things are checked first to speed up processing.... however
1331  * this means that some packets will manage to get a long way
1332  * down this stack and then be rejected, but that's life.
1333  */
1334  if ((ic->type != ICMP_DEST_UNREACH) &&
1335  (ic->type != ICMP_SOURCE_QUENCH) &&
1336  (ic->type != ICMP_TIME_EXCEEDED)) {
1337  *related = 0;
1338  return NF_ACCEPT;
1339  }
1340 
1341  /* Now find the contained IP header */
1342  offset += sizeof(_icmph);
1343  cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1344  if (cih == NULL)
1345  return NF_ACCEPT; /* The packet looks wrong, ignore */
1346 
1347  net = skb_net(skb);
1348 
1349  /* Special case for errors for IPIP packets */
1350  ipip = false;
1351  if (cih->protocol == IPPROTO_IPIP) {
1352  if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1353  return NF_ACCEPT;
1354  /* Error for our IPIP must arrive at LOCAL_IN */
1355  if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
1356  return NF_ACCEPT;
1357  offset += cih->ihl * 4;
1358  cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1359  if (cih == NULL)
1360  return NF_ACCEPT; /* The packet looks wrong, ignore */
1361  ipip = true;
1362  }
1363 
1364  pd = ip_vs_proto_data_get(net, cih->protocol);
1365  if (!pd)
1366  return NF_ACCEPT;
1367  pp = pd->pp;
1368 
1369  /* Is the embedded protocol header present? */
1370  if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1371  pp->dont_defrag))
1372  return NF_ACCEPT;
1373 
1374  IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1375  "Checking incoming ICMP for");
1376 
1377  offset2 = offset;
1378  offset += cih->ihl * 4;
1379 
1380  ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1381  /* The embedded headers contain source and dest in reverse order.
1382  * For IPIP this is error for request, not for reply.
1383  */
1384  cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, ipip ? 0 : 1);
1385  if (!cp)
1386  return NF_ACCEPT;
1387 
1388  verdict = NF_DROP;
1389 
1390  /* Ensure the checksum is correct */
1391  if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1392  /* Failed checksum! */
1393  IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1394  &iph->saddr);
1395  goto out;
1396  }
1397 
1398  if (ipip) {
1399  __be32 info = ic->un.gateway;
1400 
1401  /* Update the MTU */
1402  if (ic->type == ICMP_DEST_UNREACH &&
1403  ic->code == ICMP_FRAG_NEEDED) {
1404  struct ip_vs_dest *dest = cp->dest;
1405  u32 mtu = ntohs(ic->un.frag.mtu);
1406 
1407  /* Strip outer IP and ICMP, go to IPIP header */
1408  __skb_pull(skb, ihl + sizeof(_icmph));
1409  offset2 -= ihl + sizeof(_icmph);
1410  skb_reset_network_header(skb);
1411  IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
1412  &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
1413  rcu_read_lock();
1414  ipv4_update_pmtu(skb, dev_net(skb->dev),
1415  mtu, 0, 0, 0, 0);
1416  rcu_read_unlock();
1417  /* Client uses PMTUD? */
1418  if (!(cih->frag_off & htons(IP_DF)))
1419  goto ignore_ipip;
1420  /* Prefer the resulting PMTU */
1421  if (dest) {
1422  spin_lock(&dest->dst_lock);
1423  if (dest->dst_cache)
1424  mtu = dst_mtu(dest->dst_cache);
1425  spin_unlock(&dest->dst_lock);
1426  }
1427  if (mtu > 68 + sizeof(struct iphdr))
1428  mtu -= sizeof(struct iphdr);
1429  info = htonl(mtu);
1430  }
1431  /* Strip outer IP, ICMP and IPIP, go to IP header of
1432  * original request.
1433  */
1434  __skb_pull(skb, offset2);
1435  skb_reset_network_header(skb);
1436  IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
1437  &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1438  ic->type, ic->code, ntohl(info));
1439  icmp_send(skb, ic->type, ic->code, info);
1440  /* ICMP can be shorter but anyways, account it */
1441  ip_vs_out_stats(cp, skb);
1442 
1443 ignore_ipip:
1444  consume_skb(skb);
1445  verdict = NF_STOLEN;
1446  goto out;
1447  }
1448 
1449  /* do the statistics and put it back */
1450  ip_vs_in_stats(cp, skb);
1451  if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1452  offset += 2 * sizeof(__u16);
1453  verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum);
1454 
1455 out:
1456  __ip_vs_conn_put(cp);
1457 
1458  return verdict;
1459 }
1460 
1461 #ifdef CONFIG_IP_VS_IPV6
1462 static int
1463 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1464 {
1465  struct net *net = NULL;
1466  struct ipv6hdr *iph;
1467  struct icmp6hdr _icmph, *ic;
1468  struct ipv6hdr _ciph, *cih; /* The ip header contained
1469  within the ICMP */
1470  struct ip_vs_iphdr ciph;
1471  struct ip_vs_conn *cp;
1472  struct ip_vs_protocol *pp;
1473  struct ip_vs_proto_data *pd;
1474  unsigned int offset, verdict;
1475 
1476  *related = 1;
1477 
1478  /* reassemble IP fragments */
1479  if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1480  if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
1481  return NF_STOLEN;
1482  }
1483 
1484  iph = ipv6_hdr(skb);
1485  offset = sizeof(struct ipv6hdr);
1486  ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1487  if (ic == NULL)
1488  return NF_DROP;
1489 
1490  IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1491  ic->icmp6_type, ntohs(icmpv6_id(ic)),
1492  &iph->saddr, &iph->daddr);
1493 
1494  /*
1495  * Work through seeing if this is for us.
1496  * These checks are supposed to be in an order that means easy
1497  * things are checked first to speed up processing.... however
1498  * this means that some packets will manage to get a long way
1499  * down this stack and then be rejected, but that's life.
1500  */
1501  if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1502  (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1503  (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1504  *related = 0;
1505  return NF_ACCEPT;
1506  }
1507 
1508  /* Now find the contained IP header */
1509  offset += sizeof(_icmph);
1510  cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1511  if (cih == NULL)
1512  return NF_ACCEPT; /* The packet looks wrong, ignore */
1513 
1514  net = skb_net(skb);
1515  pd = ip_vs_proto_data_get(net, cih->nexthdr);
1516  if (!pd)
1517  return NF_ACCEPT;
1518  pp = pd->pp;
1519 
1520  /* Is the embedded protocol header present? */
1521  /* TODO: we don't support fragmentation at the moment anyways */
1522  if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1523  return NF_ACCEPT;
1524 
1525  IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
1526  "Checking incoming ICMPv6 for");
1527 
1528  offset += sizeof(struct ipv6hdr);
1529 
1530  ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1531  /* The embedded headers contain source and dest in reverse order */
1532  cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
1533  if (!cp)
1534  return NF_ACCEPT;
1535 
1536  /* do the statistics and put it back */
1537  ip_vs_in_stats(cp, skb);
1538  if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1539  IPPROTO_SCTP == cih->nexthdr)
1540  offset += 2 * sizeof(__u16);
1541  verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum);
1542 
1543  __ip_vs_conn_put(cp);
1544 
1545  return verdict;
1546 }
1547 #endif
1548 
1549 
1550 /*
1551  * Check if it's for virtual services, look it up,
1552  * and send it on its way...
1553  */
1554 static unsigned int
1555 ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1556 {
1557  struct net *net;
1558  struct ip_vs_iphdr iph;
1559  struct ip_vs_protocol *pp;
1560  struct ip_vs_proto_data *pd;
1561  struct ip_vs_conn *cp;
1562  int ret, pkts;
1563  struct netns_ipvs *ipvs;
1564 
1565  /* Already marked as IPVS request or reply? */
1566  if (skb->ipvs_property)
1567  return NF_ACCEPT;
1568 
1569  /*
1570  * Big tappo:
1571  * - remote client: only PACKET_HOST
1572  * - route: used for struct net when skb->dev is unset
1573  */
1574  if (unlikely((skb->pkt_type != PACKET_HOST &&
1575  hooknum != NF_INET_LOCAL_OUT) ||
1576  !skb_dst(skb))) {
1577  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1578  IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1579  " ignored in hook %u\n",
1580  skb->pkt_type, iph.protocol,
1581  IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1582  return NF_ACCEPT;
1583  }
1584  /* ipvs enabled in this netns ? */
1585  net = skb_net(skb);
1586  if (!net_ipvs(net)->enable)
1587  return NF_ACCEPT;
1588 
1589  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1590 
1591  /* Bad... Do not break raw sockets */
1592  if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1593  af == AF_INET)) {
1594  struct sock *sk = skb->sk;
1595  struct inet_sock *inet = inet_sk(skb->sk);
1596 
1597  if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1598  return NF_ACCEPT;
1599  }
1600 
1601 #ifdef CONFIG_IP_VS_IPV6
1602  if (af == AF_INET6) {
1603  if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1604  int related;
1605  int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1606 
1607  if (related)
1608  return verdict;
1609  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1610  }
1611  } else
1612 #endif
1613  if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1614  int related;
1615  int verdict = ip_vs_in_icmp(skb, &related, hooknum);
1616 
1617  if (related)
1618  return verdict;
1619  ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1620  }
1621 
1622  /* Protocol supported? */
1623  pd = ip_vs_proto_data_get(net, iph.protocol);
1624  if (unlikely(!pd))
1625  return NF_ACCEPT;
1626  pp = pd->pp;
1627  /*
1628  * Check if the packet belongs to an existing connection entry
1629  */
1630  cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
1631 
1632  if (unlikely(!cp)) {
1633  int v;
1634 
1635  if (!pp->conn_schedule(af, skb, pd, &v, &cp))
1636  return v;
1637  }
1638 
1639  if (unlikely(!cp)) {
1640  /* sorry, all this trouble for a no-hit :) */
1641  IP_VS_DBG_PKT(12, af, pp, skb, 0,
1642  "ip_vs_in: packet continues traversal as normal");
1643  return NF_ACCEPT;
1644  }
1645 
1646  IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1647  ipvs = net_ipvs(net);
1648  /* Check the server status */
1649  if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1650  /* the destination server is not available */
1651 
1652  if (sysctl_expire_nodest_conn(ipvs)) {
1653  /* try to expire the connection immediately */
1655  }
1656  /* don't restart its timer, and silently
1657  drop the packet. */
1658  __ip_vs_conn_put(cp);
1659  return NF_DROP;
1660  }
1661 
1662  ip_vs_in_stats(cp, skb);
1663  ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
1664  if (cp->packet_xmit)
1665  ret = cp->packet_xmit(skb, cp, pp);
1666  /* do not touch skb anymore */
1667  else {
1668  IP_VS_DBG_RL("warning: packet_xmit is null");
1669  ret = NF_ACCEPT;
1670  }
1671 
1672  /* Increase its packet counter and check if it is needed
1673  * to be synchronized
1674  *
1675  * Sync connection if it is about to close to
1676  * encorage the standby servers to update the connections timeout
1677  *
1678  * For ONE_PKT let ip_vs_sync_conn() do the filter work.
1679  */
1680 
1681  if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
1682  pkts = sysctl_sync_threshold(ipvs);
1683  else
1684  pkts = atomic_add_return(1, &cp->in_pkts);
1685 
1686  if (ipvs->sync_state & IP_VS_STATE_MASTER)
1687  ip_vs_sync_conn(net, cp, pkts);
1688 
1689  ip_vs_conn_put(cp);
1690  return ret;
1691 }
1692 
1693 /*
1694  * AF_INET handler in NF_INET_LOCAL_IN chain
1695  * Schedule and forward packets from remote clients
1696  */
1697 static unsigned int
1698 ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1699  const struct net_device *in,
1700  const struct net_device *out,
1701  int (*okfn)(struct sk_buff *))
1702 {
1703  return ip_vs_in(hooknum, skb, AF_INET);
1704 }
1705 
1706 /*
1707  * AF_INET handler in NF_INET_LOCAL_OUT chain
1708  * Schedule and forward packets from local clients
1709  */
1710 static unsigned int
1711 ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1712  const struct net_device *in, const struct net_device *out,
1713  int (*okfn)(struct sk_buff *))
1714 {
1715  unsigned int verdict;
1716 
1717  /* Disable BH in LOCAL_OUT until all places are fixed */
1718  local_bh_disable();
1719  verdict = ip_vs_in(hooknum, skb, AF_INET);
1720  local_bh_enable();
1721  return verdict;
1722 }
1723 
1724 #ifdef CONFIG_IP_VS_IPV6
1725 
1726 /*
1727  * AF_INET6 handler in NF_INET_LOCAL_IN chain
1728  * Schedule and forward packets from remote clients
1729  */
1730 static unsigned int
1731 ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1732  const struct net_device *in,
1733  const struct net_device *out,
1734  int (*okfn)(struct sk_buff *))
1735 {
1736  return ip_vs_in(hooknum, skb, AF_INET6);
1737 }
1738 
1739 /*
1740  * AF_INET6 handler in NF_INET_LOCAL_OUT chain
1741  * Schedule and forward packets from local clients
1742  */
1743 static unsigned int
1744 ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1745  const struct net_device *in, const struct net_device *out,
1746  int (*okfn)(struct sk_buff *))
1747 {
1748  unsigned int verdict;
1749 
1750  /* Disable BH in LOCAL_OUT until all places are fixed */
1751  local_bh_disable();
1752  verdict = ip_vs_in(hooknum, skb, AF_INET6);
1753  local_bh_enable();
1754  return verdict;
1755 }
1756 
1757 #endif
1758 
1759 
1760 /*
1761  * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1762  * related packets destined for 0.0.0.0/0.
1763  * When fwmark-based virtual service is used, such as transparent
1764  * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1765  * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1766  * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1767  * and send them to ip_vs_in_icmp.
1768  */
1769 static unsigned int
1770 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1771  const struct net_device *in, const struct net_device *out,
1772  int (*okfn)(struct sk_buff *))
1773 {
1774  int r;
1775  struct net *net;
1776 
1777  if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1778  return NF_ACCEPT;
1779 
1780  /* ipvs enabled in this netns ? */
1781  net = skb_net(skb);
1782  if (!net_ipvs(net)->enable)
1783  return NF_ACCEPT;
1784 
1785  return ip_vs_in_icmp(skb, &r, hooknum);
1786 }
1787 
1788 #ifdef CONFIG_IP_VS_IPV6
1789 static unsigned int
1790 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1791  const struct net_device *in, const struct net_device *out,
1792  int (*okfn)(struct sk_buff *))
1793 {
1794  int r;
1795  struct net *net;
1796 
1797  if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1798  return NF_ACCEPT;
1799 
1800  /* ipvs enabled in this netns ? */
1801  net = skb_net(skb);
1802  if (!net_ipvs(net)->enable)
1803  return NF_ACCEPT;
1804 
1805  return ip_vs_in_icmp_v6(skb, &r, hooknum);
1806 }
1807 #endif
1808 
1809 
1810 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1811  /* After packet filtering, change source only for VS/NAT */
1812  {
1813  .hook = ip_vs_reply4,
1814  .owner = THIS_MODULE,
1815  .pf = NFPROTO_IPV4,
1816  .hooknum = NF_INET_LOCAL_IN,
1817  .priority = NF_IP_PRI_NAT_SRC - 2,
1818  },
1819  /* After packet filtering, forward packet through VS/DR, VS/TUN,
1820  * or VS/NAT(change destination), so that filtering rules can be
1821  * applied to IPVS. */
1822  {
1823  .hook = ip_vs_remote_request4,
1824  .owner = THIS_MODULE,
1825  .pf = NFPROTO_IPV4,
1826  .hooknum = NF_INET_LOCAL_IN,
1827  .priority = NF_IP_PRI_NAT_SRC - 1,
1828  },
1829  /* Before ip_vs_in, change source only for VS/NAT */
1830  {
1831  .hook = ip_vs_local_reply4,
1832  .owner = THIS_MODULE,
1833  .pf = NFPROTO_IPV4,
1834  .hooknum = NF_INET_LOCAL_OUT,
1835  .priority = NF_IP_PRI_NAT_DST + 1,
1836  },
1837  /* After mangle, schedule and forward local requests */
1838  {
1839  .hook = ip_vs_local_request4,
1840  .owner = THIS_MODULE,
1841  .pf = NFPROTO_IPV4,
1842  .hooknum = NF_INET_LOCAL_OUT,
1843  .priority = NF_IP_PRI_NAT_DST + 2,
1844  },
1845  /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1846  * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1847  {
1848  .hook = ip_vs_forward_icmp,
1849  .owner = THIS_MODULE,
1850  .pf = NFPROTO_IPV4,
1851  .hooknum = NF_INET_FORWARD,
1852  .priority = 99,
1853  },
1854  /* After packet filtering, change source only for VS/NAT */
1855  {
1856  .hook = ip_vs_reply4,
1857  .owner = THIS_MODULE,
1858  .pf = NFPROTO_IPV4,
1859  .hooknum = NF_INET_FORWARD,
1860  .priority = 100,
1861  },
1862 #ifdef CONFIG_IP_VS_IPV6
1863  /* After packet filtering, change source only for VS/NAT */
1864  {
1865  .hook = ip_vs_reply6,
1866  .owner = THIS_MODULE,
1867  .pf = NFPROTO_IPV6,
1868  .hooknum = NF_INET_LOCAL_IN,
1869  .priority = NF_IP6_PRI_NAT_SRC - 2,
1870  },
1871  /* After packet filtering, forward packet through VS/DR, VS/TUN,
1872  * or VS/NAT(change destination), so that filtering rules can be
1873  * applied to IPVS. */
1874  {
1875  .hook = ip_vs_remote_request6,
1876  .owner = THIS_MODULE,
1877  .pf = NFPROTO_IPV6,
1878  .hooknum = NF_INET_LOCAL_IN,
1879  .priority = NF_IP6_PRI_NAT_SRC - 1,
1880  },
1881  /* Before ip_vs_in, change source only for VS/NAT */
1882  {
1883  .hook = ip_vs_local_reply6,
1884  .owner = THIS_MODULE,
1885  .pf = NFPROTO_IPV4,
1886  .hooknum = NF_INET_LOCAL_OUT,
1887  .priority = NF_IP6_PRI_NAT_DST + 1,
1888  },
1889  /* After mangle, schedule and forward local requests */
1890  {
1891  .hook = ip_vs_local_request6,
1892  .owner = THIS_MODULE,
1893  .pf = NFPROTO_IPV6,
1894  .hooknum = NF_INET_LOCAL_OUT,
1895  .priority = NF_IP6_PRI_NAT_DST + 2,
1896  },
1897  /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1898  * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1899  {
1900  .hook = ip_vs_forward_icmp_v6,
1901  .owner = THIS_MODULE,
1902  .pf = NFPROTO_IPV6,
1903  .hooknum = NF_INET_FORWARD,
1904  .priority = 99,
1905  },
1906  /* After packet filtering, change source only for VS/NAT */
1907  {
1908  .hook = ip_vs_reply6,
1909  .owner = THIS_MODULE,
1910  .pf = NFPROTO_IPV6,
1911  .hooknum = NF_INET_FORWARD,
1912  .priority = 100,
1913  },
1914 #endif
1915 };
1916 /*
1917  * Initialize IP Virtual Server netns mem.
1918  */
1919 static int __net_init __ip_vs_init(struct net *net)
1920 {
1921  struct netns_ipvs *ipvs;
1922 
1923  ipvs = net_generic(net, ip_vs_net_id);
1924  if (ipvs == NULL)
1925  return -ENOMEM;
1926 
1927  /* Hold the beast until a service is registerd */
1928  ipvs->enable = 0;
1929  ipvs->net = net;
1930  /* Counters used for creating unique names */
1931  ipvs->gen = atomic_read(&ipvs_netns_cnt);
1932  atomic_inc(&ipvs_netns_cnt);
1933  net->ipvs = ipvs;
1934 
1935  if (ip_vs_estimator_net_init(net) < 0)
1936  goto estimator_fail;
1937 
1938  if (ip_vs_control_net_init(net) < 0)
1939  goto control_fail;
1940 
1941  if (ip_vs_protocol_net_init(net) < 0)
1942  goto protocol_fail;
1943 
1944  if (ip_vs_app_net_init(net) < 0)
1945  goto app_fail;
1946 
1947  if (ip_vs_conn_net_init(net) < 0)
1948  goto conn_fail;
1949 
1950  if (ip_vs_sync_net_init(net) < 0)
1951  goto sync_fail;
1952 
1953  printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
1954  sizeof(struct netns_ipvs), ipvs->gen);
1955  return 0;
1956 /*
1957  * Error handling
1958  */
1959 
1960 sync_fail:
1962 conn_fail:
1963  ip_vs_app_net_cleanup(net);
1964 app_fail:
1966 protocol_fail:
1968 control_fail:
1970 estimator_fail:
1971  net->ipvs = NULL;
1972  return -ENOMEM;
1973 }
1974 
1975 static void __net_exit __ip_vs_cleanup(struct net *net)
1976 {
1977  ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
1979  ip_vs_app_net_cleanup(net);
1983  IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
1984  net->ipvs = NULL;
1985 }
1986 
1987 static void __net_exit __ip_vs_dev_cleanup(struct net *net)
1988 {
1989  EnterFunction(2);
1990  net_ipvs(net)->enable = 0; /* Disable packet reception */
1991  smp_wmb();
1993  LeaveFunction(2);
1994 }
1995 
1996 static struct pernet_operations ipvs_core_ops = {
1997  .init = __ip_vs_init,
1998  .exit = __ip_vs_cleanup,
1999  .id = &ip_vs_net_id,
2000  .size = sizeof(struct netns_ipvs),
2001 };
2002 
2003 static struct pernet_operations ipvs_core_dev_ops = {
2004  .exit = __ip_vs_dev_cleanup,
2005 };
2006 
2007 /*
2008  * Initialize IP Virtual Server
2009  */
2010 static int __init ip_vs_init(void)
2011 {
2012  int ret;
2013 
2014  ret = ip_vs_control_init();
2015  if (ret < 0) {
2016  pr_err("can't setup control.\n");
2017  goto exit;
2018  }
2019 
2021 
2022  ret = ip_vs_conn_init();
2023  if (ret < 0) {
2024  pr_err("can't setup connection table.\n");
2025  goto cleanup_protocol;
2026  }
2027 
2028  ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
2029  if (ret < 0)
2030  goto cleanup_conn;
2031 
2032  ret = register_pernet_device(&ipvs_core_dev_ops);
2033  if (ret < 0)
2034  goto cleanup_sub;
2035 
2036  ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2037  if (ret < 0) {
2038  pr_err("can't register hooks.\n");
2039  goto cleanup_dev;
2040  }
2041 
2042  ret = ip_vs_register_nl_ioctl();
2043  if (ret < 0) {
2044  pr_err("can't register netlink/ioctl.\n");
2045  goto cleanup_hooks;
2046  }
2047 
2048  pr_info("ipvs loaded.\n");
2049 
2050  return ret;
2051 
2052 cleanup_hooks:
2053  nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2054 cleanup_dev:
2055  unregister_pernet_device(&ipvs_core_dev_ops);
2056 cleanup_sub:
2057  unregister_pernet_subsys(&ipvs_core_ops);
2058 cleanup_conn:
2060 cleanup_protocol:
2063 exit:
2064  return ret;
2065 }
2066 
2067 static void __exit ip_vs_cleanup(void)
2068 {
2070  nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2071  unregister_pernet_device(&ipvs_core_dev_ops);
2072  unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
2076  pr_info("ipvs unloaded.\n");
2077 }
2078 
2079 module_init(ip_vs_init);
2080 module_exit(ip_vs_cleanup);
2081 MODULE_LICENSE("GPL");