Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
ipt_CLUSTERIP.c
Go to the documentation of this file.
1 /* Cluster IP hashmark target
2  * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
3  * based on ideas of Fabio Olive Leite <olive@unixforge.org>
4  *
5  * Development of this code funded by SuSE Linux AG, http://www.suse.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  *
11  */
12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/jhash.h>
#include <linux/bitops.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/if_arp.h>
#include <linux/seq_file.h>
#include <linux/netfilter_arp.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
#include <net/checksum.h>
#include <net/ip.h>
33 
34 #define CLUSTERIP_VERSION "0.8"
35 
36 MODULE_LICENSE("GPL");
37 MODULE_AUTHOR("Harald Welte <[email protected]>");
38 MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
39 
41  struct list_head list; /* list of all configs */
42  atomic_t refcount; /* reference count */
43  atomic_t entries; /* number of entries/rules
44  * referencing us */
45 
46  __be32 clusterip; /* the IP address */
47  u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
48  struct net_device *dev; /* device */
49  u_int16_t num_total_nodes; /* total number of nodes */
50  unsigned long local_nodes; /* node number array */
51 
52 #ifdef CONFIG_PROC_FS
53  struct proc_dir_entry *pde; /* proc dir entry */
54 #endif
55  enum clusterip_hashmode hash_mode; /* which hashing mode */
56  u_int32_t hash_initval; /* hash initialization */
57  struct rcu_head rcu;
58 };
59 
/*
 * Global list of cluster configurations.  Modifications of the list are
 * serialised by clusterip_lock; lookups walk it under RCU.
 */
static DEFINE_SPINLOCK(clusterip_lock);
static LIST_HEAD(clusterip_configs);

#ifdef CONFIG_PROC_FS
/* Directory holding one proc file per cluster IP. */
static struct proc_dir_entry *clusterip_procdir;
/* Forward declaration; the operations are defined with the proc handlers. */
static const struct file_operations clusterip_proc_fops;
#endif
69 
70 static inline void
71 clusterip_config_get(struct clusterip_config *c)
72 {
73  atomic_inc(&c->refcount);
74 }
75 
76 
77 static void clusterip_config_rcu_free(struct rcu_head *head)
78 {
79  kfree(container_of(head, struct clusterip_config, rcu));
80 }
81 
82 static inline void
83 clusterip_config_put(struct clusterip_config *c)
84 {
86  call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
87 }
88 
89 /* decrease the count of entries using/referencing this config. If last
90  * entry(rule) is removed, remove the config from lists, but don't free it
91  * yet, since proc-files could still be holding references */
92 static inline void
93 clusterip_config_entry_put(struct clusterip_config *c)
94 {
96  if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
97  list_del_rcu(&c->list);
98  spin_unlock(&clusterip_lock);
100 
101  dev_mc_del(c->dev, c->clustermac);
102  dev_put(c->dev);
103 
104  /* In case anyone still accesses the file, the open/close
105  * functions are also incrementing the refcount on their own,
106  * so it's safe to remove the entry even if it's in use. */
107 #ifdef CONFIG_PROC_FS
108  remove_proc_entry(c->pde->name, c->pde->parent);
109 #endif
110  return;
111  }
112  local_bh_enable();
113 }
114 
115 static struct clusterip_config *
116 __clusterip_config_find(__be32 clusterip)
117 {
118  struct clusterip_config *c;
119 
120  list_for_each_entry_rcu(c, &clusterip_configs, list) {
121  if (c->clusterip == clusterip)
122  return c;
123  }
124 
125  return NULL;
126 }
127 
128 static inline struct clusterip_config *
129 clusterip_config_find_get(__be32 clusterip, int entry)
130 {
131  struct clusterip_config *c;
132 
133  rcu_read_lock_bh();
134  c = __clusterip_config_find(clusterip);
135  if (c) {
137  c = NULL;
138  else if (entry)
139  atomic_inc(&c->entries);
140  }
141  rcu_read_unlock_bh();
142 
143  return c;
144 }
145 
146 static void
147 clusterip_config_init_nodelist(struct clusterip_config *c,
148  const struct ipt_clusterip_tgt_info *i)
149 {
150  int n;
151 
152  for (n = 0; n < i->num_local_nodes; n++)
153  set_bit(i->local_nodes[n] - 1, &c->local_nodes);
154 }
155 
156 static struct clusterip_config *
157 clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
158  struct net_device *dev)
159 {
160  struct clusterip_config *c;
161 
162  c = kzalloc(sizeof(*c), GFP_ATOMIC);
163  if (!c)
164  return NULL;
165 
166  c->dev = dev;
167  c->clusterip = ip;
170  clusterip_config_init_nodelist(c, i);
171  c->hash_mode = i->hash_mode;
172  c->hash_initval = i->hash_initval;
173  atomic_set(&c->refcount, 1);
174  atomic_set(&c->entries, 1);
175 
176 #ifdef CONFIG_PROC_FS
177  {
178  char buffer[16];
179 
180  /* create proc dir entry */
181  sprintf(buffer, "%pI4", &ip);
182  c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
183  clusterip_procdir,
184  &clusterip_proc_fops, c);
185  if (!c->pde) {
186  kfree(c);
187  return NULL;
188  }
189  }
190 #endif
191 
192  spin_lock_bh(&clusterip_lock);
193  list_add_rcu(&c->list, &clusterip_configs);
194  spin_unlock_bh(&clusterip_lock);
195 
196  return c;
197 }
198 
199 #ifdef CONFIG_PROC_FS
200 static int
201 clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
202 {
203 
204  if (nodenum == 0 ||
205  nodenum > c->num_total_nodes)
206  return 1;
207 
208  /* check if we already have this number in our bitfield */
209  if (test_and_set_bit(nodenum - 1, &c->local_nodes))
210  return 1;
211 
212  return 0;
213 }
214 
215 static bool
216 clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
217 {
218  if (nodenum == 0 ||
219  nodenum > c->num_total_nodes)
220  return true;
221 
222  if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
223  return false;
224 
225  return true;
226 }
227 #endif
228 
229 static inline u_int32_t
230 clusterip_hashfn(const struct sk_buff *skb,
231  const struct clusterip_config *config)
232 {
233  const struct iphdr *iph = ip_hdr(skb);
234  unsigned long hashval;
235  u_int16_t sport = 0, dport = 0;
236  int poff;
237 
238  poff = proto_ports_offset(iph->protocol);
239  if (poff >= 0) {
240  const u_int16_t *ports;
241  u16 _ports[2];
242 
243  ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
244  if (ports) {
245  sport = ports[0];
246  dport = ports[1];
247  }
248  } else {
249  net_info_ratelimited("unknown protocol %u\n", iph->protocol);
250  }
251 
252  switch (config->hash_mode) {
254  hashval = jhash_1word(ntohl(iph->saddr),
255  config->hash_initval);
256  break;
258  hashval = jhash_2words(ntohl(iph->saddr), sport,
259  config->hash_initval);
260  break;
262  hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
263  config->hash_initval);
264  break;
265  default:
266  /* to make gcc happy */
267  hashval = 0;
268  /* This cannot happen, unless the check function wasn't called
269  * at rule load time */
270  pr_info("unknown mode %u\n", config->hash_mode);
271  BUG();
272  break;
273  }
274 
275  /* node numbers are 1..n, not 0..n */
276  return (((u64)hashval * config->num_total_nodes) >> 32) + 1;
277 }
278 
279 static inline int
280 clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
281 {
282  return test_bit(hash - 1, &config->local_nodes);
283 }
284 
285 /***********************************************************************
286  * IPTABLES TARGET
287  ***********************************************************************/
288 
289 static unsigned int
290 clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
291 {
292  const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
293  struct nf_conn *ct;
294  enum ip_conntrack_info ctinfo;
295  u_int32_t hash;
296 
297  /* don't need to clusterip_config_get() here, since refcount
298  * is only decremented by destroy() - and ip_tables guarantees
299  * that the ->target() function isn't called after ->destroy() */
300 
301  ct = nf_ct_get(skb, &ctinfo);
302  if (ct == NULL)
303  return NF_DROP;
304 
305  /* special case: ICMP error handling. conntrack distinguishes between
306  * error messages (RELATED) and information requests (see below) */
307  if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
308  (ctinfo == IP_CT_RELATED ||
309  ctinfo == IP_CT_RELATED_REPLY))
310  return XT_CONTINUE;
311 
312  /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
313  * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
314  * on, which all have an ID field [relevant for hashing]. */
315 
316  hash = clusterip_hashfn(skb, cipinfo->config);
317 
318  switch (ctinfo) {
319  case IP_CT_NEW:
320  ct->mark = hash;
321  break;
322  case IP_CT_RELATED:
323  case IP_CT_RELATED_REPLY:
324  /* FIXME: we don't handle expectations at the moment.
325  * They can arrive on a different node than
326  * the master connection (e.g. FTP passive mode) */
327  case IP_CT_ESTABLISHED:
329  break;
330  default: /* Prevent gcc warnings */
331  break;
332  }
333 
334 #ifdef DEBUG
335  nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
336 #endif
337  pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
338  if (!clusterip_responsible(cipinfo->config, hash)) {
339  pr_debug("not responsible\n");
340  return NF_DROP;
341  }
342  pr_debug("responsible\n");
343 
344  /* despite being received via linklayer multicast, this is
345  * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
346  skb->pkt_type = PACKET_HOST;
347 
348  return XT_CONTINUE;
349 }
350 
351 static int clusterip_tg_check(const struct xt_tgchk_param *par)
352 {
353  struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
354  const struct ipt_entry *e = par->entryinfo;
355  struct clusterip_config *config;
356  int ret;
357 
358  if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
361  pr_info("unknown mode %u\n", cipinfo->hash_mode);
362  return -EINVAL;
363 
364  }
365  if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
366  e->ip.dst.s_addr == 0) {
367  pr_info("Please specify destination IP\n");
368  return -EINVAL;
369  }
370 
371  /* FIXME: further sanity checks */
372 
373  config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
374  if (!config) {
375  if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
376  pr_info("no config found for %pI4, need 'new'\n",
377  &e->ip.dst.s_addr);
378  return -EINVAL;
379  } else {
380  struct net_device *dev;
381 
382  if (e->ip.iniface[0] == '\0') {
383  pr_info("Please specify an interface name\n");
384  return -EINVAL;
385  }
386 
387  dev = dev_get_by_name(&init_net, e->ip.iniface);
388  if (!dev) {
389  pr_info("no such interface %s\n",
390  e->ip.iniface);
391  return -ENOENT;
392  }
393 
394  config = clusterip_config_init(cipinfo,
395  e->ip.dst.s_addr, dev);
396  if (!config) {
397  dev_put(dev);
398  return -ENOMEM;
399  }
400  dev_mc_add(config->dev, config->clustermac);
401  }
402  }
403  cipinfo->config = config;
404 
406  if (ret < 0)
407  pr_info("cannot load conntrack support for proto=%u\n",
408  par->family);
409  return ret;
410 }
411 
412 /* drop reference count of cluster config when rule is deleted */
413 static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
414 {
415  const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
416 
417  /* if no more entries are referencing the config, remove it
418  * from the list and destroy the proc entry */
419  clusterip_config_entry_put(cipinfo->config);
420 
421  clusterip_config_put(cipinfo->config);
422 
424 }
425 
426 #ifdef CONFIG_COMPAT
427 struct compat_ipt_clusterip_tgt_info
428 {
430  u_int8_t clustermac[6];
431  u_int16_t num_total_nodes;
432  u_int16_t num_local_nodes;
433  u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
435  u_int32_t hash_initval;
437 };
438 #endif /* CONFIG_COMPAT */
439 
440 static struct xt_target clusterip_tg_reg __read_mostly = {
441  .name = "CLUSTERIP",
442  .family = NFPROTO_IPV4,
443  .target = clusterip_tg,
444  .checkentry = clusterip_tg_check,
445  .destroy = clusterip_tg_destroy,
446  .targetsize = sizeof(struct ipt_clusterip_tgt_info),
447 #ifdef CONFIG_COMPAT
448  .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
449 #endif /* CONFIG_COMPAT */
450  .me = THIS_MODULE
451 };
452 
453 
454 /***********************************************************************
455  * ARP MANGLING CODE
456  ***********************************************************************/
457 
458 /* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
459 struct arp_payload {
464 } __packed;
465 
466 #ifdef DEBUG
467 static void arp_print(struct arp_payload *payload)
468 {
469 #define HBUFFERLEN 30
470  char hbuffer[HBUFFERLEN];
471  int j,k;
472 
473  for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
474  hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
475  hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
476  hbuffer[k++]=':';
477  }
478  hbuffer[--k]='\0';
479 
480  pr_debug("src %pI4@%s, dst %pI4\n",
481  &payload->src_ip, hbuffer, &payload->dst_ip);
482 }
483 #endif
484 
485 static unsigned int
486 arp_mangle(unsigned int hook,
487  struct sk_buff *skb,
488  const struct net_device *in,
489  const struct net_device *out,
490  int (*okfn)(struct sk_buff *))
491 {
492  struct arphdr *arp = arp_hdr(skb);
493  struct arp_payload *payload;
494  struct clusterip_config *c;
495 
496  /* we don't care about non-ethernet and non-ipv4 ARP */
497  if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
498  arp->ar_pro != htons(ETH_P_IP) ||
499  arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
500  return NF_ACCEPT;
501 
502  /* we only want to mangle arp requests and replies */
503  if (arp->ar_op != htons(ARPOP_REPLY) &&
504  arp->ar_op != htons(ARPOP_REQUEST))
505  return NF_ACCEPT;
506 
507  payload = (void *)(arp+1);
508 
509  /* if there is no clusterip configuration for the arp reply's
510  * source ip, we don't want to mangle it */
511  c = clusterip_config_find_get(payload->src_ip, 0);
512  if (!c)
513  return NF_ACCEPT;
514 
515  /* normally the linux kernel always replies to arp queries of
516  * addresses on different interfacs. However, in the CLUSTERIP case
517  * this wouldn't work, since we didn't subscribe the mcast group on
518  * other interfaces */
519  if (c->dev != out) {
520  pr_debug("not mangling arp reply on different "
521  "interface: cip'%s'-skb'%s'\n",
522  c->dev->name, out->name);
523  clusterip_config_put(c);
524  return NF_ACCEPT;
525  }
526 
527  /* mangle reply hardware address */
528  memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
529 
530 #ifdef DEBUG
531  pr_debug("mangled arp reply: ");
532  arp_print(payload);
533 #endif
534 
535  clusterip_config_put(c);
536 
537  return NF_ACCEPT;
538 }
539 
540 static struct nf_hook_ops cip_arp_ops __read_mostly = {
541  .hook = arp_mangle,
542  .pf = NFPROTO_ARP,
543  .hooknum = NF_ARP_OUT,
544  .priority = -1
545 };
546 
547 /***********************************************************************
548  * PROC DIR HANDLING
549  ***********************************************************************/
550 
551 #ifdef CONFIG_PROC_FS
552 
/* Iterator state used while a proc file lists the local node numbers. */
struct clusterip_seq_position {
	unsigned int pos;	/* index of the current element */
	unsigned int weight;	/* number of set bits, i.e. element count */
	unsigned int bit;	/* 1-based bit number being reported */
	unsigned long val;	/* bits that have not been reported yet */
};
559 
560 static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
561 {
562  struct clusterip_config *c = s->private;
563  unsigned int weight;
565  struct clusterip_seq_position *idx;
566 
567  /* FIXME: possible race */
568  local_nodes = c->local_nodes;
569  weight = hweight32(local_nodes);
570  if (*pos >= weight)
571  return NULL;
572 
573  idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
574  if (!idx)
575  return ERR_PTR(-ENOMEM);
576 
577  idx->pos = *pos;
578  idx->weight = weight;
579  idx->bit = ffs(local_nodes);
580  idx->val = local_nodes;
581  clear_bit(idx->bit - 1, &idx->val);
582 
583  return idx;
584 }
585 
586 static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
587 {
588  struct clusterip_seq_position *idx = v;
589 
590  *pos = ++idx->pos;
591  if (*pos >= idx->weight) {
592  kfree(v);
593  return NULL;
594  }
595  idx->bit = ffs(idx->val);
596  clear_bit(idx->bit - 1, &idx->val);
597  return idx;
598 }
599 
/* seq_file ->stop: free the iterator unless @v is an error pointer. */
static void clusterip_seq_stop(struct seq_file *s, void *v)
{
	if (IS_ERR(v))
		return;

	kfree(v);
}
605 
606 static int clusterip_seq_show(struct seq_file *s, void *v)
607 {
608  struct clusterip_seq_position *idx = v;
609 
610  if (idx->pos != 0)
611  seq_putc(s, ',');
612 
613  seq_printf(s, "%u", idx->bit);
614 
615  if (idx->pos == idx->weight - 1)
616  seq_putc(s, '\n');
617 
618  return 0;
619 }
620 
621 static const struct seq_operations clusterip_seq_ops = {
622  .start = clusterip_seq_start,
623  .next = clusterip_seq_next,
624  .stop = clusterip_seq_stop,
625  .show = clusterip_seq_show,
626 };
627 
628 static int clusterip_proc_open(struct inode *inode, struct file *file)
629 {
630  int ret = seq_open(file, &clusterip_seq_ops);
631 
632  if (!ret) {
633  struct seq_file *sf = file->private_data;
634  struct clusterip_config *c = PDE(inode)->data;
635 
636  sf->private = c;
637 
638  clusterip_config_get(c);
639  }
640 
641  return ret;
642 }
643 
644 static int clusterip_proc_release(struct inode *inode, struct file *file)
645 {
646  struct clusterip_config *c = PDE(inode)->data;
647  int ret;
648 
649  ret = seq_release(inode, file);
650 
651  if (!ret)
652  clusterip_config_put(c);
653 
654  return ret;
655 }
656 
657 static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
658  size_t size, loff_t *ofs)
659 {
660  struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
661 #define PROC_WRITELEN 10
662  char buffer[PROC_WRITELEN+1];
663  unsigned long nodenum;
664 
665  if (size > PROC_WRITELEN)
666  return -EIO;
667  if (copy_from_user(buffer, input, size))
668  return -EFAULT;
669  buffer[size] = 0;
670 
671  if (*buffer == '+') {
672  nodenum = simple_strtoul(buffer+1, NULL, 10);
673  if (clusterip_add_node(c, nodenum))
674  return -ENOMEM;
675  } else if (*buffer == '-') {
676  nodenum = simple_strtoul(buffer+1, NULL,10);
677  if (clusterip_del_node(c, nodenum))
678  return -ENOENT;
679  } else
680  return -EIO;
681 
682  return size;
683 }
684 
685 static const struct file_operations clusterip_proc_fops = {
686  .owner = THIS_MODULE,
687  .open = clusterip_proc_open,
688  .read = seq_read,
689  .write = clusterip_proc_write,
690  .llseek = seq_lseek,
691  .release = clusterip_proc_release,
692 };
693 
694 #endif /* CONFIG_PROC_FS */
695 
696 static int __init clusterip_tg_init(void)
697 {
698  int ret;
699 
700  ret = xt_register_target(&clusterip_tg_reg);
701  if (ret < 0)
702  return ret;
703 
704  ret = nf_register_hook(&cip_arp_ops);
705  if (ret < 0)
706  goto cleanup_target;
707 
708 #ifdef CONFIG_PROC_FS
709  clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
710  if (!clusterip_procdir) {
711  pr_err("Unable to proc dir entry\n");
712  ret = -ENOMEM;
713  goto cleanup_hook;
714  }
715 #endif /* CONFIG_PROC_FS */
716 
717  pr_info("ClusterIP Version %s loaded successfully\n",
719  return 0;
720 
721 #ifdef CONFIG_PROC_FS
722 cleanup_hook:
723  nf_unregister_hook(&cip_arp_ops);
724 #endif /* CONFIG_PROC_FS */
725 cleanup_target:
726  xt_unregister_target(&clusterip_tg_reg);
727  return ret;
728 }
729 
730 static void __exit clusterip_tg_exit(void)
731 {
732  pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
733 #ifdef CONFIG_PROC_FS
734  remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
735 #endif
736  nf_unregister_hook(&cip_arp_ops);
737  xt_unregister_target(&clusterip_tg_reg);
738 
739  /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
740  rcu_barrier_bh();
741 }
742 
/* Module entry and exit points. */
module_init(clusterip_tg_init);
module_exit(clusterip_tg_exit);