#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
#include <linux/hash.h>
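/* Sizing of the per-namespace VNI lookup table and the per-device
 * forwarding database (FDB), plus the default FDB ageing parameters.
 */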
#define VXLAN_VERSION	"0.1"

#define VNI_HASH_BITS	10
#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
#define FDB_HASH_BITS	8
#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
#define FDB_AGE_DEFAULT 300
#define FDB_AGE_INTERVAL (10 * HZ)

#define VXLAN_N_VID	(1u << 24)
#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
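/* Worst-case outer overhead: IPv4 header + UDP + VXLAN + inner Ethernet */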
#define VXLAN_HEADROOM (20 + 8 + 8 + 14)
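/* First word of the VXLAN header: only the I flag (valid VNI) is set */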
#define VXLAN_FLAGS 0x08000000
static bool log_ecn_error = true;

static unsigned int vxlan_net_id;
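/* Look up a vxlan device by VNI in the per-namespace hash table */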
	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
		if (vxlan->vni == id)
			return vxlan;
	}
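/* Fill one RTM_NEWNEIGH message from a forwarding table entry */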
	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));
	/* ... */

	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
		goto nla_put_failure;

	ci.ndm_confirmed = 0;
	/* ... fill the remaining cacheinfo fields ... */

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
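/* Worst-case netlink message size for one FDB entry */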
static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN)		/* NDA_LLADDR */
		+ nla_total_size(sizeof(__be32))	/* NDA_DST */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}
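/* Broadcast an RTM_NEWNEIGH/RTM_DELNEIGH notification for an FDB change */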
static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
			     const struct vxlan_fdb *fdb, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
	/* ... */
	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
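/* Hash an Ethernet address for FDB bucket selection */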
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}

/* Hash chain to use for a given destination MAC */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
						const unsigned char *mac)
{
	return &vxlan->fdb_head[eth_hash(mac)];
}
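/* Look up a forwarding entry by destination MAC (RCU read side) */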
	hlist_for_each_entry_rcu(f, node, head, hlist) {
		if (compare_ether_addr(mac, f->eth_addr) == 0)
			return f;
	}

	return NULL;
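/* Add a new forwarding entry, or update an existing one */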
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
			    const u8 *mac, __be32 ip,
			    __u16 state, __u16 flags)
{
	struct vxlan_fdb *f;

	f = vxlan_find_mac(vxlan, mac);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}
		if (f->state != state) {
			f->state = state;
			/* ... */
		}
	} else {
		/* ... allocate the new entry, then link it in ... */
		hlist_add_head_rcu(&f->hlist,
				   vxlan_fdb_head(vxlan, mac));
	}

	return 0;
}

/* Remove a forwarding entry and notify listeners */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
{
	/* ... */
	hlist_del_rcu(&f->hlist);
	/* ... */
}
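/* Add a static FDB entry requested via netlink (RTM_NEWNEIGH) */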
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 flags)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 ip;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (tb[NDA_DST] == NULL)
		return -EINVAL;
	/* ... */
	ip = nla_get_be32(tb[NDA_DST]);

	spin_lock_bh(&vxlan->hash_lock);
	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}
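/* Delete an FDB entry via netlink (RTM_DELNEIGH) */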
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	int err = -ENOENT;

	spin_lock_bh(&vxlan->hash_lock);
	f = vxlan_find_mac(vxlan, addr);
	if (f) {
		vxlan_fdb_destroy(vxlan, f);
		err = 0;
	}
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}
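/* Dump the forwarding table; cb->args[0] carries the resume index */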
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
			  struct net_device *dev, int idx)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* ... walk every hash chain, one entry at a time ... */
		if (idx < cb->args[0])
			goto skip;

		err = vxlan_fdb_info(skb, vxlan, f,
				     NETLINK_CB(cb->skb).portid,
				     cb->nlh->nlmsg_seq,
				     RTM_NEWNEIGH, NLM_F_MULTI);
	/* ... */
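/* Learn the mapping: inner source MAC -> outer source IP of the sender */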
static void vxlan_snoop(struct net_device *dev,
			__be32 src_ip, const u8 *src_mac)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	int err;

	f = vxlan_find_mac(vxlan, src_mac);
	if (f) {
		if (likely(f->remote_ip == src_ip))
			return;

		netdev_info(dev,
			    "%pM migrated from %pI4 to %pI4\n",
			    src_mac, &f->remote_ip, &src_ip);
		f->remote_ip = src_ip;
	} else {
		/* ... learn a new entry, under the hash lock ... */
		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
				       NUD_REACHABLE,
				       NLM_F_EXCL|NLM_F_CREATE);
	}
}
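/* Is any other running vxlan device in this namespace using the group? */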
		if (!netif_running(vxlan->dev))
			continue;

		if (vxlan->gaddr == this->gaddr)
			return true;
static int vxlan_join_group(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
	/* ... */

	/* Already joined by another device in this namespace? */
	if (vxlan_group_used(vn, vxlan))
		return 0;
	/* ... join the multicast group on the shared UDP socket ... */
}
static int vxlan_leave_group(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
	/* ... */

	/* Only leave the group when the last user is gone */
	if (vxlan_group_used(vn, vxlan))
		return 0;
	/* ... */
}
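/* Callback from net/ipv4/udp.c to handle one received, encapsulated frame */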
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *oip;
	struct vxlanhdr *vxh;
	struct vxlan_dev *vxlan;
	struct vxlan_stats *stats;
	__u32 vni;
	int err;

	/* pop off the outer UDP header */
	__skb_pull(skb, sizeof(struct udphdr));

	/* need the VXLAN header to be present */
	if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
		goto error;

	/* ... check vx_flags and the reserved bits ... */
	vxh = (struct vxlanhdr *) skb->data;

	__skb_pull(skb, sizeof(struct vxlanhdr));

	/* is this VNI configured? */
	vni = ntohl(vxh->vx_vni) >> 8;
	vxlan = vxlan_find_vni(sock_net(sk), vni);
	if (!vxlan)
		goto drop;
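	/* sanity-check the inner Ethernet frame before handing it upward */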
	if (!pskb_may_pull(skb, ETH_HLEN)) {
		vxlan->dev->stats.rx_length_errors++;
		vxlan->dev->stats.rx_errors++;
		goto drop;
	}

	/* ignore packets looped back from ourselves */
	if (compare_ether_addr(eth_hdr(skb)->h_source,
			       vxlan->dev->dev_addr) == 0)
		goto drop;
	__skb_tunnel_rx(skb, vxlan->dev);
	skb_reset_network_header(skb);
	/* ... */
	err = IP_ECN_decapsulate(oip, skb);
	if (unlikely(err)) {
		/* ... optionally log, controlled by log_ecn_error ... */
		if (err > 1) {
			++vxlan->dev->stats.rx_frame_errors;
			++vxlan->dev->stats.rx_errors;
			goto drop;
		}
	}

	stats = this_cpu_ptr(vxlan->stats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	netif_rx(skb);

	return 0;
drop:
	/* consume the bad packet */
	kfree_skb(skb);
	return 0;

error:
	/* not VXLAN: restore the UDP header and let the stack have it */
	__skb_push(skb, sizeof(struct udphdr));
	return 1;
}
/* Read the inner DSCP/ECN byte, for IPv4 or IPv6 inner packets */
static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
				   const struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_IP))
		return iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
	else
		return 0;
}
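/* Propagate ECN bits from the inner header into the outer TOS */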
static inline u8 vxlan_ecn_encap(u8 tos, const struct iphdr *iph,
				 const struct sk_buff *skb)
{
	u8 inner = vxlan_get_dsfield(iph, skb);

	return INET_ECN_encapsulate(tos, inner);
}
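/* Pick the outer destination IP: unicast FDB match, else the multicast group */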
static __be32 vxlan_find_dst(struct vxlan_dev *vxlan, struct sk_buff *skb)
{
	const struct ethhdr *eth = (struct ethhdr *) skb->data;
	const struct vxlan_fdb *f;

	if (is_multicast_ether_addr(eth->h_dest))
		return vxlan->gaddr;

	f = vxlan_find_mac(vxlan, eth->h_dest);
	if (f)
		return f->remote_ip;
	else
		return vxlan->gaddr;	/* flood unknown unicast */
}
/* skb destructor: drop the reference on the tunnel socket */
static void vxlan_sock_free(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
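/* Pick the outer UDP source port by scaling the flow hash into the
 * configured [port_min, port_max] range, so ECMP can spread flows.
 */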
static u16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
{
	unsigned int range = (vxlan->port_max - vxlan->port_min) + 1;
	u32 hash;

	/* prefer the L4 flow hash; it may already be there from hardware */
	hash = skb_get_rxhash(skb);
	if (!hash)
		hash = jhash(skb->data, 2 * ETH_ALEN,
			     (__force u32) skb->protocol);

	return (((u64) hash * range) >> 32) + vxlan->port_min;
}
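/* Transmit path: route the outer packet, then wrap the inner frame in
 * VXLAN, UDP and IPv4 headers and hand it to ip_local_out().
 */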
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct rtable *rt;
	const struct iphdr *old_iph;
	struct iphdr *iph;
	struct vxlanhdr *vxh;
	struct udphdr *uh;
	struct flowi4 fl4;
	unsigned int pkt_len = skb->len;
	__be32 dst;
	__u16 src_port;
	__u8 tos, ttl;
	int err;

	dst = vxlan_find_dst(vxlan, skb);
	if (!dst)
		goto drop;
	/* ... make headroom for the new headers ... */
	old_iph = ip_hdr(skb);
	/* ... */
	tos = vxlan_get_dsfield(old_iph, skb);

	src_port = vxlan_src_port(vxlan, skb);
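	/* route the outer packet via the configured interface and source */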
	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = vxlan->link;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.daddr = dst;
	fl4.saddr = vxlan->saddr;

	rt = ip_route_output_key(dev_net(dev), &fl4);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &dst);
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}

	if (rt->dst.dev == dev) {
		netdev_dbg(dev, "circular route to %pI4\n", &dst);
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
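	/* build the outer headers in place: VXLAN, then UDP, then IPv4 */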
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = htonl(VXLAN_FLAGS);
	vxh->vx_vni = htonl(vxlan->vni << 8);

	__skb_push(skb, sizeof(*uh));
	skb_reset_transport_header(skb);
	uh = udp_hdr(skb);
	/* ... destination port, source port (src_port), length, checksum ... */

	__skb_push(skb, sizeof(*iph));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->protocol = IPPROTO_UDP;
	iph->tos = vxlan_ecn_encap(tos, old_iph, skb);
	iph->daddr = dst;
	iph->saddr = fl4.saddr;
	iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
	vxlan_set_owner(dev, skb);
	/* ... */
	ip_select_ident(iph, &rt->dst, NULL);

	err = ip_local_out(skb);
	if (likely(net_xmit_eval(err) == 0)) {
		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);

		u64_stats_update_begin(&stats->syncp);
		stats->tx_packets++;
		stats->tx_bytes += pkt_len;
		u64_stats_update_end(&stats->syncp);
	} else {
		dev->stats.tx_errors++;
		dev->stats.tx_aborted_errors++;
	}
	return NETDEV_TX_OK;
drop:
	dev->stats.tx_dropped++;
	goto tx_free;

tx_error:
	dev->stats.tx_errors++;
tx_free:
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
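/* Ageing timer: walk the FDB and expire entries not used recently */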
static void vxlan_cleanup(unsigned long arg)
{
	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;

	if (!netif_running(vxlan->dev))
		return;
	/* ... for each entry, compute timeout = last use + age_interval ... */
		if (time_before_eq(timeout, jiffies)) {
			netdev_dbg(vxlan->dev,
				   "garbage collect %pM\n",
				   f->eth_addr);
			/* ... */
			vxlan_fdb_destroy(vxlan, f);
		} else if (time_before(timeout, next_timer))
			next_timer = timeout;
	/* ... */
	mod_timer(&vxlan->age_timer, next_timer);
}
/* Set up per-cpu stats when the device is created */
static int vxlan_init(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* ... allocate vxlan->stats (the per-cpu counters) ... */
	return 0;
}
/* Bring the device up: join the multicast group, start the ageing timer */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err;

	if (vxlan->gaddr) {
		err = vxlan_join_group(dev);
		if (err)
			return err;
	}
	/* ... arm vxlan->age_timer if ageing is enabled ... */
	return 0;
}
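/* Purge the entire forwarding table */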
static void vxlan_flush(struct vxlan_dev *vxlan)
{
	/* ... for every entry on every hash chain ... */
		vxlan_fdb_destroy(vxlan, f);
	/* ... */
}
/* Shut down: leave the group, stop the timer, drop all learned entries */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	if (vxlan->gaddr)
		vxlan_leave_group(dev);
	/* ... stop the ageing timer and flush the FDB ... */
	return 0;
}
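/* Aggregate the per-cpu counters into an rtnl_link_stats64 reply */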
static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
					       struct rtnl_link_stats64 *stats)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* ... for each possible CPU, snapshot its counters ... */
		do {
			start = u64_stats_fetch_begin_bh(&stats->syncp);
			/* ... copy this CPU's counters ... */
		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
	/* ... sum the snapshots into *stats ... */
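	/* errors and drops are kept in dev->stats by the slow paths */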
	stats->multicast = dev->stats.multicast;
	stats->rx_length_errors = dev->stats.rx_length_errors;
	stats->rx_frame_errors = dev->stats.rx_frame_errors;
	stats->rx_errors = dev->stats.rx_errors;

	stats->tx_dropped = dev->stats.tx_dropped;
	stats->tx_carrier_errors = dev->stats.tx_carrier_errors;
	stats->tx_aborted_errors = dev->stats.tx_aborted_errors;
	stats->collisions = dev->stats.collisions;
	stats->tx_errors = dev->stats.tx_errors;

	return stats;
}
/* Nothing to do: a virtual tunnel endpoint has no hardware filter */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
static const struct net_device_ops vxlan_netdev_ops = {
	.ndo_init		= vxlan_init,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= vxlan_stats64,
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
	/* ... standard Ethernet MTU/address helpers ... */
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
};
/* Undo vxlan_init(): release per-cpu stats, then the netdev itself */
static void vxlan_free(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	free_percpu(vxlan->stats);
	free_netdev(dev);
}
/* Initialize the device structure */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* ... Ethernet-style setup, headroom, destructor ... */
	eth_hw_addr_random(dev);
	/* ... */
	vxlan->age_timer.function = vxlan_cleanup;
	vxlan->age_timer.data = (unsigned long) vxlan;
	/* ... */
}
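/* Validate the netlink attributes before a device is created or changed */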
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
			pr_debug("invalid link address (not ethernet)\n");
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
			pr_debug("invalid all zero ethernet address\n");
			return -EADDRNOTAVAIL;
		}
	}
	/* ... */
	if (data[IFLA_VXLAN_ID]) {
		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
		if (id >= VXLAN_VID_MASK)
			return -ERANGE;
	}

	if (data[IFLA_VXLAN_GROUP]) {
		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
		if (!IN_MULTICAST(ntohl(gaddr))) {
			pr_debug("group address is not IPv4 multicast\n");
			return -EADDRNOTAVAIL;
		}
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
			pr_debug("port range %u .. %u not valid\n",
				 ntohs(p->low), ntohs(p->high));
			return -EINVAL;
		}
	}

	return 0;
}
static int vxlan_newlink(struct net *net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__u32 vni;
	int err;

	if (!data[IFLA_VXLAN_ID])
		return -EINVAL;

	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
	if (vxlan_find_vni(net, vni)) {
		pr_info("duplicate VNI %u\n", vni);
		return -EEXIST;
	}
	vxlan->vni = vni;

	if (data[IFLA_VXLAN_GROUP])
		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);

	if (data[IFLA_VXLAN_LOCAL])
		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);

	if (data[IFLA_VXLAN_LINK]) {
		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
		if (!__dev_get_by_index(net, vxlan->link)) {
			pr_info("ifindex %d does not exist\n", vxlan->link);
			return -ENODEV;
		}
		/* ... derive MTU and header length from the lower device ... */
	}

	if (data[IFLA_VXLAN_TOS])
		vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
		vxlan->learn = true;

	if (data[IFLA_VXLAN_AGEING])
		vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
	else
		vxlan->age_interval = FDB_AGE_DEFAULT;

	if (data[IFLA_VXLAN_LIMIT])
		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
		vxlan->port_min = ntohs(p->low);
		vxlan->port_max = ntohs(p->high);
	}

	err = register_netdevice(dev);
	if (!err)
		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));

	return err;
}
/* Unlink from the VNI table and queue the device for unregistration */
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	hlist_del_rcu(&vxlan->hlist);

	unregister_netdevice_queue(dev, head);
}
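/* Size of the IFLA_INFO_DATA payload filled in by vxlan_fill_info() */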
static size_t vxlan_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
		0;
}
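/* Report the device configuration back through netlink */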
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ifla_vxlan_port_range ports = {
		.low  = htons(vxlan->port_min),
		.high = htons(vxlan->port_max),
	};

	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
		goto nla_put_failure;

	if (vxlan->gaddr && nla_put_be32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
		goto nla_put_failure;

	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
		goto nla_put_failure;

	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn) ||
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax))
		goto nla_put_failure;

	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
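/* rtnl_link glue: "ip link add ... type vxlan ..." lands here */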
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",
	/* ... maxtype, priv_size ... */
	.policy		= vxlan_policy,
	.setup		= vxlan_setup,
	.validate	= vxlan_validate,
	.newlink	= vxlan_newlink,
	.dellink	= vxlan_dellink,
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
};
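/* Per-namespace init: open the shared kernel UDP socket that receives
 * all encapsulated traffic for this namespace.
 */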
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct sockaddr_in vxlan_addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	struct sock *sk;
	int rc;

	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
	if (rc < 0) {
		pr_debug("UDP socket create failed\n");
		return rc;
	}

	/* put the socket in the right namespace */
	sk = vn->sock->sk;
	sk_change_net(sk, net);
	/* ... set vxlan_addr.sin_port to the VXLAN UDP port ... */
	rc = kernel_bind(vn->sock, (struct sockaddr *) &vxlan_addr,
			 sizeof(vxlan_addr));
	if (rc < 0) {
		pr_debug("bind for UDP socket %pI4:%u (%d)\n",
			 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
		/* ... */
		return rc;
	}

	/* disable multicast loopback */
	inet_sk(sk)->mc_loop = 0;

	/* mark the socket as an encapsulation socket so udp_rcv()
	 * hands matching packets to vxlan_udp_encap_recv()
	 */
	udp_sk(sk)->encap_type = 1;
	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
	/* ... initialize the vni_list hash heads ... */
	return 0;
}
static __net_exit void vxlan_exit_net(struct net *net)
{
	/* ... release the namespace's UDP socket ... */
}

static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
	.exit = vxlan_exit_net,
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};
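/* Module plumbing: register the per-namespace ops and the rtnl link ops */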
static int __init vxlan_init_module(void)
{
	int rc;

	rc = register_pernet_device(&vxlan_net_ops);
	if (rc)
		return rc;

	rc = rtnl_link_register(&vxlan_link_ops);
	if (rc)
		unregister_pernet_device(&vxlan_net_ops);

	return rc;
}
module_init(vxlan_init_module);

static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_pernet_device(&vxlan_net_ops);
}
module_exit(vxlan_cleanup_module);