Linux Kernel 3.7.1
macvtap.c
#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/*
 * A macvtap queue is the central object of this driver; it connects
 * an open character device to a macvlan interface. There can be
 * multiple queues on one interface, which map back to queues
 * implemented in hardware on the underlying device.
 *
 * macvtap_proto is used to allocate queues through the sock allocation
 * mechanism.
 *
 * TODO: multiqueue support is currently not implemented, even though
 * macvtap is basically prepared for it. We will need to add this
 * here as well as in virtio-net and qemu to get line rate on 10gbit
 * adapters from a guest.
 */
struct macvtap_queue {
        struct sock sk;
        struct socket sock;
        struct socket_wq wq;
        int vnet_hdr_sz;
        struct macvlan_dev __rcu *vlan;
        struct file *file;
        unsigned int flags;
};

static struct proto macvtap_proto = {
        .name = "macvtap",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct macvtap_queue),
};

/*
 * Variables for dealing with macvtap device numbers.
 */
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS (1U << MINORBITS)
static DEFINE_MUTEX(minor_lock);
static DEFINE_IDR(minor_idr);

#define GOODCOPY_LEN 128
static struct class *macvtap_class;
static struct cdev macvtap_cdev;

static const struct proto_ops macvtap_socket_ops;

/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled, the
 * pointers from one to the other can only be read while rcu_read_lock
 * or macvtap_lock is held.
 *
 * Both the file and the macvlan_dev hold a reference on the macvtap_queue
 * through sock_hold(&q->sk). When the macvlan_dev goes away first,
 * q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
static DEFINE_SPINLOCK(macvtap_lock);

/*
 * get_slot: return a [unused/occupied] slot in vlan->taps[]:
 * - if 'q' is NULL, return the first empty slot;
 * - otherwise, return the slot this pointer occupies.
 */
static int get_slot(struct macvlan_dev *vlan, struct macvtap_queue *q)
{
        int i;

        for (i = 0; i < MAX_MACVTAP_QUEUES; i++) {
                if (rcu_dereference_protected(vlan->taps[i],
                                              lockdep_is_held(&macvtap_lock)) == q)
                        return i;
        }

        /* Should never happen */
        BUG_ON(1);
}

static int macvtap_set_queue(struct net_device *dev, struct file *file,
                             struct macvtap_queue *q)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        int index;
        int err = -EBUSY;

        spin_lock(&macvtap_lock);
        if (vlan->numvtaps == MAX_MACVTAP_QUEUES)
                goto out;

        err = 0;
        index = get_slot(vlan, NULL);
        rcu_assign_pointer(q->vlan, vlan);
        rcu_assign_pointer(vlan->taps[index], q);
        sock_hold(&q->sk);

        q->file = file;
        file->private_data = q;

        vlan->numvtaps++;

out:
        spin_unlock(&macvtap_lock);
        return err;
}

/*
 * The file owning the queue got closed, give up both
 * the reference that the file holds as well as the
 * one from the macvlan_dev if that still exists.
 *
 * Using the spinlock makes sure that we don't get
 * to the queue again after destroying it.
 */
static void macvtap_put_queue(struct macvtap_queue *q)
{
        struct macvlan_dev *vlan;

        spin_lock(&macvtap_lock);
        vlan = rcu_dereference_protected(q->vlan,
                                         lockdep_is_held(&macvtap_lock));
        if (vlan) {
                int index = get_slot(vlan, q);

                RCU_INIT_POINTER(vlan->taps[index], NULL);
                RCU_INIT_POINTER(q->vlan, NULL);
                sock_put(&q->sk);
                --vlan->numvtaps;
        }

        spin_unlock(&macvtap_lock);

        synchronize_rcu();
        sock_put(&q->sk);
}

/*
 * Select a queue based on the rxq of the device on which this packet
 * arrived. If the incoming device is not mq, calculate a flow hash
 * to select a queue. If all fails, find the first available queue.
 * Cache vlan->numvtaps since it can become zero during the execution
 * of this function.
 */
static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
                                               struct sk_buff *skb)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvtap_queue *tap = NULL;
        int numvtaps = vlan->numvtaps;
        __u32 rxq;

        if (!numvtaps)
                goto out;

        /* Check if we can use flow to select a queue */
        rxq = skb_get_rxhash(skb);
        if (rxq) {
                tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
                if (tap)
                        goto out;
        }

        if (likely(skb_rx_queue_recorded(skb))) {
                rxq = skb_get_rx_queue(skb);

                while (unlikely(rxq >= numvtaps))
                        rxq -= numvtaps;

                tap = rcu_dereference(vlan->taps[rxq]);
                if (tap)
                        goto out;
        }

        /* Everything failed - find first available queue */
        for (rxq = 0; rxq < MAX_MACVTAP_QUEUES; rxq++) {
                tap = rcu_dereference(vlan->taps[rxq]);
                if (tap)
                        break;
        }

out:
        return tap;
}

/*
 * The net_device is going away, give up the reference
 * that it holds on all queues and safely set the pointer
 * from the queues to NULL.
 */
static void macvtap_del_queues(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvtap_queue *q, *qlist[MAX_MACVTAP_QUEUES];
        int i, j = 0;

        /* macvtap_put_queue can free some slots, so go through all slots */
        spin_lock(&macvtap_lock);
        for (i = 0; i < MAX_MACVTAP_QUEUES && vlan->numvtaps; i++) {
                q = rcu_dereference_protected(vlan->taps[i],
                                              lockdep_is_held(&macvtap_lock));
                if (q) {
                        qlist[j++] = q;
                        RCU_INIT_POINTER(vlan->taps[i], NULL);
                        RCU_INIT_POINTER(q->vlan, NULL);
                        vlan->numvtaps--;
                }
        }
        BUG_ON(vlan->numvtaps != 0);
        /* guarantee that any future macvtap_set_queue will fail */
        vlan->numvtaps = MAX_MACVTAP_QUEUES;
        spin_unlock(&macvtap_lock);

        synchronize_rcu();

        for (--j; j >= 0; j--)
                sock_put(&qlist[j]->sk);
}

/*
 * Forward happens for data that gets sent from one macvlan
 * endpoint to another one in bridge mode. We just take
 * the skb and put it into the receive queue.
 */
static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
{
        struct macvtap_queue *q = macvtap_get_queue(dev, skb);
        if (!q)
                goto drop;

        if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len)
                goto drop;

        skb_queue_tail(&q->sk.sk_receive_queue, skb);
        wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
        return NET_RX_SUCCESS;

drop:
        kfree_skb(skb);
        return NET_RX_DROP;
}

/*
 * Receive is for data from the external interface (lowerdev);
 * in the case of macvtap we can treat it the same way as
 * forward, which macvlan cannot.
 */
static int macvtap_receive(struct sk_buff *skb)
{
        skb_push(skb, ETH_HLEN);
        return macvtap_forward(skb->dev, skb);
}

static int macvtap_get_minor(struct macvlan_dev *vlan)
{
        int retval = -ENOMEM;
        int id;

        mutex_lock(&minor_lock);
        if (idr_pre_get(&minor_idr, GFP_KERNEL) == 0)
                goto exit;

        retval = idr_get_new_above(&minor_idr, vlan, 1, &id);
        if (retval < 0) {
                if (retval == -EAGAIN)
                        retval = -ENOMEM;
                goto exit;
        }
        if (id < MACVTAP_NUM_DEVS) {
                vlan->minor = id;
        } else {
                printk(KERN_ERR "too many macvtap devices\n");
                retval = -EINVAL;
                idr_remove(&minor_idr, id);
        }
exit:
        mutex_unlock(&minor_lock);
        return retval;
}

static void macvtap_free_minor(struct macvlan_dev *vlan)
{
        mutex_lock(&minor_lock);
        if (vlan->minor) {
                idr_remove(&minor_idr, vlan->minor);
                vlan->minor = 0;
        }
        mutex_unlock(&minor_lock);
}

static struct net_device *dev_get_by_macvtap_minor(int minor)
{
        struct net_device *dev = NULL;
        struct macvlan_dev *vlan;

        mutex_lock(&minor_lock);
        vlan = idr_find(&minor_idr, minor);
        if (vlan) {
                dev = vlan->dev;
                dev_hold(dev);
        }
        mutex_unlock(&minor_lock);
        return dev;
}

static int macvtap_newlink(struct net *src_net,
                           struct net_device *dev,
                           struct nlattr *tb[],
                           struct nlattr *data[])
{
        /* Don't put anything that may fail after macvlan_common_newlink
         * because we can't undo what it does.
         */
        return macvlan_common_newlink(src_net, dev, tb, data,
                                      macvtap_receive, macvtap_forward);
}

static void macvtap_dellink(struct net_device *dev,
                            struct list_head *head)
{
        macvtap_del_queues(dev);
        macvlan_dellink(dev, head);
}

static void macvtap_setup(struct net_device *dev)
{
        macvlan_common_setup(dev);
        dev->tx_queue_len = TUN_READQ_SIZE;
}

static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
        .kind = "macvtap",
        .setup = macvtap_setup,
        .newlink = macvtap_newlink,
        .dellink = macvtap_dellink,
};

static void macvtap_sock_write_space(struct sock *sk)
{
        wait_queue_head_t *wqueue;

        if (!sock_writeable(sk) ||
            !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
                return;

        wqueue = sk_sleep(sk);
        if (wqueue && waitqueue_active(wqueue))
                wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static void macvtap_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_receive_queue);
}

static int macvtap_open(struct inode *inode, struct file *file)
{
        struct net *net = current->nsproxy->net_ns;
        struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode));
        struct macvtap_queue *q;
        int err;

        err = -ENODEV;
        if (!dev)
                goto out;

        err = -ENOMEM;
        q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                             &macvtap_proto);
        if (!q)
                goto out;

        q->sock.wq = &q->wq;
        init_waitqueue_head(&q->wq.wait);
        q->sock.type = SOCK_RAW;
        q->sock.state = SS_CONNECTED;
        q->sock.file = file;
        q->sock.ops = &macvtap_socket_ops;
        sock_init_data(&q->sock, &q->sk);
        q->sk.sk_write_space = macvtap_sock_write_space;
        q->sk.sk_destruct = macvtap_sock_destruct;
        q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
        q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

        /*
         * so far only KVM virtio_net uses macvtap, enable zero copy between
         * guest kernel and host kernel when lower device supports zerocopy
         *
         * The macvlan supports zerocopy iff the lower device supports zero
         * copy so we don't have to look at the lower device directly.
         */
        if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
                sock_set_flag(&q->sk, SOCK_ZEROCOPY);

        err = macvtap_set_queue(dev, file, q);
        if (err)
                sock_put(&q->sk);

out:
        if (dev)
                dev_put(dev);

        return err;
}

static int macvtap_release(struct inode *inode, struct file *file)
{
        struct macvtap_queue *q = file->private_data;
        macvtap_put_queue(q);
        return 0;
}

static unsigned int macvtap_poll(struct file *file, poll_table *wait)
{
        struct macvtap_queue *q = file->private_data;
        unsigned int mask = POLLERR;

        if (!q)
                goto out;

        mask = 0;
        poll_wait(file, &q->wq.wait, wait);

        if (!skb_queue_empty(&q->sk.sk_receive_queue))
                mask |= POLLIN | POLLRDNORM;

        if (sock_writeable(&q->sk) ||
            (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
             sock_writeable(&q->sk)))
                mask |= POLLOUT | POLLWRNORM;

out:
        return mask;
}

static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
                                                size_t len, size_t linear,
                                                int noblock, int *err)
{
        struct sk_buff *skb;

        /* Under a page? Don't bother with paged skb. */
        if (prepad + len < PAGE_SIZE || !linear)
                linear = len;

        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
                                   err);
        if (!skb)
                return NULL;

        skb_reserve(skb, prepad);
        skb_put(skb, linear);
        skb->data_len = len - linear;
        skb->len += len - linear;

        return skb;
}

/* set skb frags from iovec, this can move to core network code for reuse */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
                                  int offset, size_t count)
{
        int len = iov_length(from, count) - offset;
        int copy = skb_headlen(skb);
        int size, offset1 = 0;
        int i = 0;

        /* Skip over from offset */
        while (count && (offset >= from->iov_len)) {
                offset -= from->iov_len;
                ++from;
                --count;
        }

        /* copy up to skb headlen */
        while (count && (copy > 0)) {
                size = min_t(unsigned int, copy, from->iov_len - offset);
                if (copy_from_user(skb->data + offset1, from->iov_base + offset,
                                   size))
                        return -EFAULT;
                if (copy > size) {
                        ++from;
                        --count;
                        offset = 0;
                } else
                        offset += size;
                copy -= size;
                offset1 += size;
        }

        if (len == offset1)
                return 0;

        while (count--) {
                struct page *page[MAX_SKB_FRAGS];
                int num_pages;
                unsigned long base;
                unsigned long truesize;

                len = from->iov_len - offset;
                if (!len) {
                        offset = 0;
                        ++from;
                        continue;
                }
                base = (unsigned long)from->iov_base + offset;
                size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
                if (i + size > MAX_SKB_FRAGS)
                        return -EMSGSIZE;
                num_pages = get_user_pages_fast(base, size, 0, &page[i]);
                if (num_pages != size) {
                        for (i = 0; i < num_pages; i++)
                                put_page(page[i]);
                        return -EFAULT;
                }
                truesize = size * PAGE_SIZE;
                skb->data_len += len;
                skb->len += len;
                skb->truesize += truesize;
                atomic_add(truesize, &skb->sk->sk_wmem_alloc);
                while (len) {
                        int off = base & ~PAGE_MASK;
                        int size = min_t(int, len, PAGE_SIZE - off);
                        __skb_fill_page_desc(skb, i, page[i], off, size);
                        skb_shinfo(skb)->nr_frags++;
                        /* increase sk_wmem_alloc */
                        base += size;
                        len -= size;
                        i++;
                }
                offset = 0;
                ++from;
        }
        return 0;
}

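/*
 * Worked example (illustrative, not from this file): PAGE_MASK is
 * ~(PAGE_SIZE - 1), so the "+ ~PAGE_MASK" in the size calculation above
 * rounds up to a whole number of pages. With PAGE_SIZE = 4096, a user
 * buffer at base = 0x10100 (0x100 bytes into a page) with len = 8192:
 *
 *     size = (0x100 + 8192 + 4095) >> 12 = 12543 >> 12 = 3
 *
 * The 8 KiB span straddles three pages, so get_user_pages_fast() must pin
 * three pages even though len is exactly two pages long.
 */
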
/*
 * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
 * be shared with the tun/tap driver.
 */
static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb,
                                     struct virtio_net_hdr *vnet_hdr)
{
        unsigned short gso_type = 0;
        if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                case VIRTIO_NET_HDR_GSO_TCPV4:
                        gso_type = SKB_GSO_TCPV4;
                        break;
                case VIRTIO_NET_HDR_GSO_TCPV6:
                        gso_type = SKB_GSO_TCPV6;
                        break;
                case VIRTIO_NET_HDR_GSO_UDP:
                        gso_type = SKB_GSO_UDP;
                        break;
                default:
                        return -EINVAL;
                }

                if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
                        gso_type |= SKB_GSO_TCP_ECN;

                if (vnet_hdr->gso_size == 0)
                        return -EINVAL;
        }

        if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
                                          vnet_hdr->csum_offset))
                        return -EINVAL;
        }

        if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
                skb_shinfo(skb)->gso_type = gso_type;

                /* Header must be checked, and gso_segs computed. */
                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
                skb_shinfo(skb)->gso_segs = 0;
        }
        return 0;
}

static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
                                   struct virtio_net_hdr *vnet_hdr)
{
        memset(vnet_hdr, 0, sizeof(*vnet_hdr));

        if (skb_is_gso(skb)) {
                struct skb_shared_info *sinfo = skb_shinfo(skb);

                /* This is a hint as to how much should be linear. */
                vnet_hdr->hdr_len = skb_headlen(skb);
                vnet_hdr->gso_size = sinfo->gso_size;
                if (sinfo->gso_type & SKB_GSO_TCPV4)
                        vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                else if (sinfo->gso_type & SKB_GSO_TCPV6)
                        vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                else if (sinfo->gso_type & SKB_GSO_UDP)
                        vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
                else
                        BUG();
                if (sinfo->gso_type & SKB_GSO_TCP_ECN)
                        vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
        } else
                vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                vnet_hdr->csum_start = skb_checksum_start_offset(skb);
                vnet_hdr->csum_offset = skb->csum_offset;
        } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
                vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
        } /* else everything is zero */

        return 0;
}

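/*
 * Illustrative sketch (assumed values, not taken from this file): the kind
 * of header the two helpers above translate. For a TCPv4 frame that needs
 * both segmentation and checksum completion, a guest would prepend roughly:
 *
 *     struct virtio_net_hdr hdr = {
 *         .flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM,
 *         .gso_type    = VIRTIO_NET_HDR_GSO_TCPV4,
 *         .hdr_len     = 54,    (ethernet 14 + IPv4 20 + TCP 20)
 *         .gso_size    = 1448,  (TCP MSS)
 *         .csum_start  = 34,    (checksumming starts after ethernet + IPv4)
 *         .csum_offset = 16,    (checksum field offset within the TCP header)
 *     };
 *
 * macvtap_skb_from_vnet_hdr() turns this into skb GSO/checksum metadata on
 * the way in; macvtap_skb_to_vnet_hdr() performs the inverse on the way out.
 */
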
/* Get packet from user space buffer */
static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
                                const struct iovec *iv, unsigned long total_len,
                                size_t count, int noblock)
{
        struct sk_buff *skb;
        struct macvlan_dev *vlan;
        unsigned long len = total_len;
        int err;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int vnet_hdr_len = 0;
        int copylen = 0;
        bool zerocopy = false;

        if (q->flags & IFF_VNET_HDR) {
                vnet_hdr_len = q->vnet_hdr_sz;

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto err;
                len -= vnet_hdr_len;

                err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0,
                                          sizeof(vnet_hdr));
                if (err < 0)
                        goto err;
                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                    vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
                        vnet_hdr.hdr_len)
                        vnet_hdr.hdr_len = vnet_hdr.csum_start +
                                vnet_hdr.csum_offset + 2;
                err = -EINVAL;
                if (vnet_hdr.hdr_len > len)
                        goto err;
        }

        err = -EINVAL;
        if (unlikely(len < ETH_HLEN))
                goto err;

        err = -EMSGSIZE;
        if (unlikely(count > UIO_MAXIOV))
                goto err;

        if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
                zerocopy = true;

        if (zerocopy) {
                /* Userspace may produce vectors with count greater than
                 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
                 * to let the rest of the data fit in the frags.
                 */
                if (count > MAX_SKB_FRAGS) {
                        copylen = iov_length(iv, count - MAX_SKB_FRAGS);
                        if (copylen < vnet_hdr_len)
                                copylen = 0;
                        else
                                copylen -= vnet_hdr_len;
                }
                /* There are 256 bytes to be copied into the skb, so there
                 * is enough room for expanding the skb head if needed.
                 * The rest of the buffer is mapped from userspace.
                 */
                if (copylen < vnet_hdr.hdr_len)
                        copylen = vnet_hdr.hdr_len;
                if (!copylen)
                        copylen = GOODCOPY_LEN;
        } else
                copylen = len;

        skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
                                vnet_hdr.hdr_len, noblock, &err);
        if (!skb)
                goto err;

        if (zerocopy)
                err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
        else
                err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
                                                   len);
        if (err)
                goto err_kfree;

        skb_set_network_header(skb, ETH_HLEN);
        skb_reset_mac_header(skb);
        skb->protocol = eth_hdr(skb)->h_proto;

        if (vnet_hdr_len) {
                err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr);
                if (err)
                        goto err_kfree;
        }

        rcu_read_lock_bh();
        vlan = rcu_dereference_bh(q->vlan);
        /* copy skb_ubuf_info for callback when skb has no error */
        if (zerocopy) {
                skb_shinfo(skb)->destructor_arg = m->msg_control;
                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
        }
        if (vlan)
                macvlan_start_xmit(skb, vlan->dev);
        else
                kfree_skb(skb);
        rcu_read_unlock_bh();

        return total_len;

err_kfree:
        kfree_skb(skb);

err:
        rcu_read_lock_bh();
        vlan = rcu_dereference_bh(q->vlan);
        if (vlan)
                vlan->dev->stats.tx_dropped++;
        rcu_read_unlock_bh();

        return err;
}

static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
                                 unsigned long count, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        ssize_t result = -ENOLINK;
        struct macvtap_queue *q = file->private_data;

        result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
                                  file->f_flags & O_NONBLOCK);
        return result;
}

/* Put packet to the user space buffer */
static ssize_t macvtap_put_user(struct macvtap_queue *q,
                                const struct sk_buff *skb,
                                const struct iovec *iv, int len)
{
        struct macvlan_dev *vlan;
        int ret;
        int vnet_hdr_len = 0;
        int vlan_offset = 0;
        int copied;

        if (q->flags & IFF_VNET_HDR) {
                struct virtio_net_hdr vnet_hdr;
                vnet_hdr_len = q->vnet_hdr_sz;
                if ((len -= vnet_hdr_len) < 0)
                        return -EINVAL;

                ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
                if (ret)
                        return ret;

                if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
                        return -EFAULT;
        }
        copied = vnet_hdr_len;

        if (!vlan_tx_tag_present(skb))
                len = min_t(int, skb->len, len);
        else {
                int copy;
                struct {
                        __be16 h_vlan_proto;
                        __be16 h_vlan_TCI;
                } veth;
                veth.h_vlan_proto = htons(ETH_P_8021Q);
                veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));

                vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
                len = min_t(int, skb->len + VLAN_HLEN, len);

                copy = min_t(int, vlan_offset, len);
                ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
                len -= copy;
                copied += copy;
                if (ret || !len)
                        goto done;

                copy = min_t(int, sizeof(veth), len);
                ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
                len -= copy;
                copied += copy;
                if (ret || !len)
                        goto done;
        }

        ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
        copied += len;

done:
        rcu_read_lock_bh();
        vlan = rcu_dereference_bh(q->vlan);
        if (vlan)
                macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
        rcu_read_unlock_bh();

        return ret ? ret : copied;
}

static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
                               const struct iovec *iv, unsigned long len,
                               int noblock)
{
        DEFINE_WAIT(wait);
        struct sk_buff *skb;
        ssize_t ret = 0;

        while (len) {
                prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE);

                /* Read frames from the queue */
                skb = skb_dequeue(&q->sk.sk_receive_queue);
                if (!skb) {
                        if (noblock) {
                                ret = -EAGAIN;
                                break;
                        }
                        if (signal_pending(current)) {
                                ret = -ERESTARTSYS;
                                break;
                        }
                        /* Nothing to read, let's sleep */
                        schedule();
                        continue;
                }
                ret = macvtap_put_user(q, skb, iv, len);
                kfree_skb(skb);
                break;
        }

        finish_wait(sk_sleep(&q->sk), &wait);
        return ret;
}

static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
                                unsigned long count, loff_t pos)
{
        struct file *file = iocb->ki_filp;
        struct macvtap_queue *q = file->private_data;
        ssize_t len, ret = 0;

        len = iov_length(iv, count);
        if (len < 0) {
                ret = -EINVAL;
                goto out;
        }

        ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK);
        ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */
out:
        return ret;
}

/*
 * provide compatibility with generic tun/tap interface
 */
static long macvtap_ioctl(struct file *file, unsigned int cmd,
                          unsigned long arg)
{
        struct macvtap_queue *q = file->private_data;
        struct macvlan_dev *vlan;
        void __user *argp = (void __user *)arg;
        struct ifreq __user *ifr = argp;
        unsigned int __user *up = argp;
        unsigned int u;
        int __user *sp = argp;
        int s;
        int ret;

        switch (cmd) {
        case TUNSETIFF:
                /* ignore the name, just look at flags */
                if (get_user(u, &ifr->ifr_flags))
                        return -EFAULT;

                ret = 0;
                if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP))
                        ret = -EINVAL;
                else
                        q->flags = u;

                return ret;

        case TUNGETIFF:
                rcu_read_lock_bh();
                vlan = rcu_dereference_bh(q->vlan);
                if (vlan)
                        dev_hold(vlan->dev);
                rcu_read_unlock_bh();

                if (!vlan)
                        return -ENOLINK;

                ret = 0;
                if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) ||
                    put_user(q->flags, &ifr->ifr_flags))
                        ret = -EFAULT;
                dev_put(vlan->dev);
                return ret;

        case TUNGETFEATURES:
                if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up))
                        return -EFAULT;
                return 0;

        case TUNSETSNDBUF:
                if (get_user(u, up))
                        return -EFAULT;

                q->sk.sk_sndbuf = u;
                return 0;

        case TUNGETVNETHDRSZ:
                s = q->vnet_hdr_sz;
                if (put_user(s, sp))
                        return -EFAULT;
                return 0;

        case TUNSETVNETHDRSZ:
                if (get_user(s, sp))
                        return -EFAULT;
                if (s < (int)sizeof(struct virtio_net_hdr))
                        return -EINVAL;

                q->vnet_hdr_sz = s;
                return 0;

        case TUNSETOFFLOAD:
                /* let the user check for future flags */
                if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
                            TUN_F_TSO_ECN | TUN_F_UFO))
                        return -EINVAL;

                /* TODO: only accept frames with the features that
                   got enabled for forwarded frames */
                if (!(q->flags & IFF_VNET_HDR))
                        return -EINVAL;
                return 0;

        default:
                return -EINVAL;
        }
}

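/*
 * Illustrative userspace sketch (not part of the driver; the interface
 * index 3 is an assumption): attaching to a macvtap device through the
 * tun/tap-compatible ioctls above. The device node is created below as
 * "tap%d" using the macvlan interface index.
 *
 *     int fd = open("/dev/tap3", O_RDWR);
 *     struct ifreq ifr;
 *     int hdrsz;
 *
 *     memset(&ifr, 0, sizeof(ifr));
 *     ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
 *     if (fd < 0 || ioctl(fd, TUNSETIFF, &ifr) < 0)
 *         exit(1);
 *     ioctl(fd, TUNGETVNETHDRSZ, &hdrsz);
 *
 * Each read() then returns one frame prefixed by a struct virtio_net_hdr,
 * and each write() injects one; TUNSETIFF ignores the name and only
 * validates the flags, exactly as the handler above shows.
 */
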
#ifdef CONFIG_COMPAT
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
                                 unsigned long arg)
{
        return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations macvtap_fops = {
        .owner = THIS_MODULE,
        .open = macvtap_open,
        .release = macvtap_release,
        .aio_read = macvtap_aio_read,
        .aio_write = macvtap_aio_write,
        .poll = macvtap_poll,
        .llseek = no_llseek,
        .unlocked_ioctl = macvtap_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = macvtap_compat_ioctl,
#endif
};

static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
                           struct msghdr *m, size_t total_len)
{
        struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
        return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
                                m->msg_flags & MSG_DONTWAIT);
}

static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
                           struct msghdr *m, size_t total_len,
                           int flags)
{
        struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
        int ret;
        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
                return -EINVAL;
        ret = macvtap_do_read(q, iocb, m->msg_iov, total_len,
                              flags & MSG_DONTWAIT);
        if (ret > total_len) {
                m->msg_flags |= MSG_TRUNC;
                ret = flags & MSG_TRUNC ? ret : total_len;
        }
        return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops macvtap_socket_ops = {
        .sendmsg = macvtap_sendmsg,
        .recvmsg = macvtap_recvmsg,
};

/* Get an underlying socket object from tun file. Returns error unless file is
 * attached to a device. The returned object works like a packet socket, it
 * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
 * holding a reference to the file for as long as the socket is in use. */
struct socket *macvtap_get_socket(struct file *file)
{
        struct macvtap_queue *q;
        if (file->f_op != &macvtap_fops)
                return ERR_PTR(-EINVAL);
        q = file->private_data;
        if (!q)
                return ERR_PTR(-EBADFD);
        return &q->sock;
}
EXPORT_SYMBOL_GPL(macvtap_get_socket);

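/*
 * Illustrative sketch of a kernel-side caller (an assumption about usage,
 * not code from this file): a consumer such as vhost-net can resolve a
 * userspace-supplied fd to this socket and push frames through the
 * proto_ops above:
 *
 *     struct file *f = fget(fd);
 *     struct socket *sock = macvtap_get_socket(f);
 *
 *     if (IS_ERR(sock))
 *         return PTR_ERR(sock);
 *     sock_sendmsg(sock, &msg, total_len);
 *
 * sock_sendmsg() ends up in macvtap_sendmsg(); the caller must hold its
 * reference on the file (and fput() it) for as long as the socket is in
 * use, as required by the comment above.
 */
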
static int macvtap_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct macvlan_dev *vlan;
        struct device *classdev;
        dev_t devt;
        int err;

        if (dev->rtnl_link_ops != &macvtap_link_ops)
                return NOTIFY_DONE;

        vlan = netdev_priv(dev);

        switch (event) {
        case NETDEV_REGISTER:
                /* Create the device node here after the network device has
                 * been registered but before register_netdevice has
                 * finished running.
                 */
                err = macvtap_get_minor(vlan);
                if (err)
                        return notifier_from_errno(err);

                devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
                classdev = device_create(macvtap_class, &dev->dev, devt,
                                         dev, "tap%d", dev->ifindex);
                if (IS_ERR(classdev)) {
                        macvtap_free_minor(vlan);
                        return notifier_from_errno(PTR_ERR(classdev));
                }
                break;
        case NETDEV_UNREGISTER:
                devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
                device_destroy(macvtap_class, devt);
                macvtap_free_minor(vlan);
                break;
        }

        return NOTIFY_DONE;
}

static struct notifier_block macvtap_notifier_block __read_mostly = {
        .notifier_call = macvtap_device_event,
};

static int macvtap_init(void)
{
        int err;

        err = alloc_chrdev_region(&macvtap_major, 0,
                                  MACVTAP_NUM_DEVS, "macvtap");
        if (err)
                goto out1;

        cdev_init(&macvtap_cdev, &macvtap_fops);
        err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
        if (err)
                goto out2;

        macvtap_class = class_create(THIS_MODULE, "macvtap");
        if (IS_ERR(macvtap_class)) {
                err = PTR_ERR(macvtap_class);
                goto out3;
        }

        err = register_netdevice_notifier(&macvtap_notifier_block);
        if (err)
                goto out4;

        err = macvlan_link_register(&macvtap_link_ops);
        if (err)
                goto out5;

        return 0;

out5:
        unregister_netdevice_notifier(&macvtap_notifier_block);
out4:
        class_unregister(macvtap_class);
out3:
        cdev_del(&macvtap_cdev);
out2:
        unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
out1:
        return err;
}
module_init(macvtap_init);

static void macvtap_exit(void)
{
        rtnl_link_unregister(&macvtap_link_ops);
        unregister_netdevice_notifier(&macvtap_notifier_block);
        class_unregister(macvtap_class);
        cdev_del(&macvtap_cdev);
        unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_AUTHOR("Arnd Bergmann <[email protected]>");
MODULE_LICENSE("GPL");