Linux Kernel 3.7.1
dev.c
1 /*
2  * NET3 Protocol independent device support routines.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  *
9  * Derived from the non IP parts of dev.c 1.0.19
10  * Authors: Ross Biro
11  * Fred N. van Kempen, <[email protected]>
12  * Mark Evans, <[email protected]>
13  *
14  * Additional Authors:
15  * Florian la Roche <[email protected]>
16  * Alan Cox <[email protected]>
17  * David Hinds <[email protected]>
18  * Alexey Kuznetsov <[email protected]>
19  * Adam Sulmicki <[email protected]>
20  * Pekka Riikonen <[email protected]>
21  *
22  * Changes:
23  * D.J. Barrow : Fixed bug where dev->refcnt gets set
24  * to 2 if register_netdev gets called
25  * before net_dev_init & also removed a
26  * few lines of code in the process.
27  * Alan Cox : device private ioctl copies fields back.
28  * Alan Cox : Transmit queue code does relevant
29  * stunts to keep the queue safe.
30  * Alan Cox : Fixed double lock.
31  * Alan Cox : Fixed promisc NULL pointer trap
32  * ???????? : Support the full private ioctl range
33  * Alan Cox : Moved ioctl permission check into
34  * drivers
35  * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36  * Alan Cox : 100 backlog just doesn't cut it when
37  * you start doing multicast video 8)
38  * Alan Cox : Rewrote net_bh and list manager.
39  * Alan Cox : Fix ETH_P_ALL echoback lengths.
40  * Alan Cox : Took out transmit every packet pass
41  * Saved a few bytes in the ioctl handler
42  * Alan Cox : Network driver sets packet type before
43  * calling netif_rx. Saves a function
44  * call a packet.
45  * Alan Cox : Hashed net_bh()
46  * Richard Kooijman: Timestamp fixes.
47  * Alan Cox : Wrong field in SIOCGIFDSTADDR
48  * Alan Cox : Device lock protection.
49  * Alan Cox : Fixed nasty side effect of device close
50  * changes.
51  * Rudi Cilibrasi : Pass the right thing to
52  * set_mac_address()
53  * Dave Miller : 32bit quantity for the device lock to
54  * make it work out on a Sparc.
55  * Bjorn Ekwall : Added KERNELD hack.
56  * Alan Cox : Cleaned up the backlog initialise.
57  * Craig Metz : SIOCGIFCONF fix if space for under
58  * 1 device.
59  * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60  * is no device open function.
61  * Andi Kleen : Fix error reporting for SIOCGIFCONF
62  * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63  * Cyrus Durgin : Cleaned for KMOD
64  * Adam Sulmicki : Bug Fix : Network Device Unload
65  * A network device unload needs to purge
66  * the backlog queue.
67  * Paul Rusty Russell : SIOCSIFNAME
68  * Pekka Riikonen : Netdev boot-time settings code
69  * Andrew Morton : Make unregister_netdevice wait
70  * indefinitely on dev->refcnt
71  * J Hadi Salim : - Backlog queue sampling
72  * - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
137 #include <net/flow_keys.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 /*
148  * The list of packet types we will receive (as opposed to discard)
149  * and the routines to invoke.
150  *
151  * Why 16? Because with 16 the only overlap we get on a hash of the
152  * low nibble of the protocol value is RARP/SNAP/X.25.
153  *
154  * NOTE: That is no longer true with the addition of VLAN tags. Not
155  * sure which should go first, but I bet it won't make much
156  * difference if we are running VLANs. The good news is that
157  * this protocol won't be in the list unless compiled in, so
158  * the average user (w/out VLANs) will not be adversely affected.
159  * --BLG
160  *
161  * 0800 IP
162  * 8100 802.1Q VLAN
163  * 0001 802.3
164  * 0002 AX.25
165  * 0004 802.2
166  * 8035 RARP
167  * 0005 SNAP
168  * 0805 X.25
169  * 0806 ARP
170  * 8137 IPX
171  * 0009 Localtalk
172  * 86DD IPv6
173  */
174 
175 #define PTYPE_HASH_SIZE (16)
176 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177 
178 static DEFINE_SPINLOCK(ptype_lock);
179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180 static struct list_head ptype_all __read_mostly; /* Taps */
181 
182 /*
183  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184  * semaphore.
185  *
186  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187  *
188  * Writers must hold the rtnl semaphore while they loop through the
189  * dev_base_head list, and hold dev_base_lock for writing when they do the
190  * actual updates. This allows pure readers to access the list even
191  * while a writer is preparing to update it.
192  *
193  * To put it another way, dev_base_lock is held for writing only to
194  * protect against pure readers; the rtnl semaphore provides the
195  * protection against other writers.
196  *
197  * See, for example usages, register_netdevice() and
198  * unregister_netdevice(), which must be called with the rtnl
199  * semaphore held.
200  */
201 DEFINE_RWLOCK(dev_base_lock);
202 EXPORT_SYMBOL(dev_base_lock);
203 
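
/* Illustrative sketch (not part of dev.c): the reader side of the locking
 * scheme described above.  A pure reader only needs rcu_read_lock() (or
 * read_lock(&dev_base_lock)); writers such as list_netdevice() below take
 * the RTNL and then dev_base_lock for the actual list update.
 */
#if 0	/* example only */
static void example_walk_devices(struct net *net)
{
	struct net_device *dev;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		pr_info("dev %s ifindex %d\n", dev->name, dev->ifindex);
	rcu_read_unlock();
}
#endif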
204 static inline void dev_base_seq_inc(struct net *net)
205 {
206  while (++net->dev_base_seq == 0);
207 }
208 
209 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210 {
211  unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212 
213  return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214 }
215 
216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217 {
218  return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219 }
220 
221 static inline void rps_lock(struct softnet_data *sd)
222 {
223 #ifdef CONFIG_RPS
224  spin_lock(&sd->input_pkt_queue.lock);
225 #endif
226 }
227 
228 static inline void rps_unlock(struct softnet_data *sd)
229 {
230 #ifdef CONFIG_RPS
231  spin_unlock(&sd->input_pkt_queue.lock);
232 #endif
233 }
234 
235 /* Device list insertion */
236 static int list_netdevice(struct net_device *dev)
237 {
238  struct net *net = dev_net(dev);
239 
240  ASSERT_RTNL();
241 
242  write_lock_bh(&dev_base_lock);
243  list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244  hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245  hlist_add_head_rcu(&dev->index_hlist,
246  dev_index_hash(net, dev->ifindex));
247  write_unlock_bh(&dev_base_lock);
248 
249  dev_base_seq_inc(net);
250 
251  return 0;
252 }
253 
254 /* Device list removal
255  * caller must respect a RCU grace period before freeing/reusing dev
256  */
257 static void unlist_netdevice(struct net_device *dev)
258 {
259  ASSERT_RTNL();
260 
261  /* Unlink dev from the device chain */
262  write_lock_bh(&dev_base_lock);
263  list_del_rcu(&dev->dev_list);
264  hlist_del_rcu(&dev->name_hlist);
265  hlist_del_rcu(&dev->index_hlist);
266  write_unlock_bh(&dev_base_lock);
267 
268  dev_base_seq_inc(dev_net(dev));
269 }
270 
271 /*
272  * Our notifier list
273  */
274 
275 static RAW_NOTIFIER_HEAD(netdev_chain);
276 
277 /*
278  * Device drivers call our routines to queue packets here. We empty the
279  * queue in the local softnet handler.
280  */
281 
282 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283 EXPORT_PER_CPU_SYMBOL(softnet_data);
284 
285 #ifdef CONFIG_LOCKDEP
286 /*
287  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288  * according to dev->type
289  */
290 static const unsigned short netdev_lock_type[] =
291  {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
292  ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
293  ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
294  ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
295  ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
296  ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
297  ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
298  ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
299  ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
300  ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
301  ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
302  ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
303  ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
304  ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
305  ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
306 
307 static const char *const netdev_lock_name[] =
308  {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309  "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310  "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311  "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312  "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313  "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314  "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315  "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316  "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317  "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318  "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319  "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
320  "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
321  "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
322  "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
323 
324 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
325 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326 
327 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328 {
329  int i;
330 
331  for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332  if (netdev_lock_type[i] == dev_type)
333  return i;
334  /* the last key is used by default */
335  return ARRAY_SIZE(netdev_lock_type) - 1;
336 }
337 
338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339  unsigned short dev_type)
340 {
341  int i;
342 
343  i = netdev_lock_pos(dev_type);
344  lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345  netdev_lock_name[i]);
346 }
347 
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350  int i;
351 
352  i = netdev_lock_pos(dev->type);
353  lockdep_set_class_and_name(&dev->addr_list_lock,
354  &netdev_addr_lock_key[i],
355  netdev_lock_name[i]);
356 }
357 #else
358 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
359  unsigned short dev_type)
360 {
361 }
362 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
363 {
364 }
365 #endif
366 
367 /*******************************************************************************
368 
369  Protocol management and registration routines
370 
371 *******************************************************************************/
372 
373 /*
374  * Add a protocol ID to the list. Now that the input handler is
375  * smarter we can dispense with all the messy stuff that used to be
376  * here.
377  *
378  * BEWARE!!! Protocol handlers, mangling input packets,
379  * MUST BE last in hash buckets and checking protocol handlers
380  * MUST start from promiscuous ptype_all chain in net_bh.
381  * It is true now, do not change it.
382  * Explanation follows: if protocol handler, mangling packet, will
383  * be the first on list, it is not able to sense, that packet
384  * is cloned and should be copied-on-write, so that it will
385  * change it and subsequent readers will get broken packet.
386  * --ANK (980803)
387  */
388 
389 static inline struct list_head *ptype_head(const struct packet_type *pt)
390 {
391  if (pt->type == htons(ETH_P_ALL))
392  return &ptype_all;
393  else
394  return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395 }
396 
410 void dev_add_pack(struct packet_type *pt)
411 {
412  struct list_head *head = ptype_head(pt);
413 
414  spin_lock(&ptype_lock);
415  list_add_rcu(&pt->list, head);
416  spin_unlock(&ptype_lock);
417 }
419 
433 void __dev_remove_pack(struct packet_type *pt)
434 {
435  struct list_head *head = ptype_head(pt);
436  struct packet_type *pt1;
437 
438  spin_lock(&ptype_lock);
439 
440  list_for_each_entry(pt1, head, list) {
441  if (pt == pt1) {
442  list_del_rcu(&pt->list);
443  goto out;
444  }
445  }
446 
447  pr_warn("dev_remove_pack: %p not found\n", pt);
448 out:
449  spin_unlock(&ptype_lock);
450 }
452 
465 void dev_remove_pack(struct packet_type *pt)
466 {
467  __dev_remove_pack(pt);
468 
469  synchronize_net();
470 }
472 
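
/* Illustrative sketch (not part of dev.c): how a protocol module typically
 * uses dev_add_pack()/dev_remove_pack().  ETH_P_ALL lands on the ptype_all
 * tap list, a specific protocol lands in ptype_base.  The handler and
 * variable names below are hypothetical.
 */
#if 0	/* example only */
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* a real handler would parse the frame here */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pt __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),
	.func	= example_rcv,
};

/* dev_add_pack(&example_pt) in module init,
 * dev_remove_pack(&example_pt) in module exit. */
#endif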
473 /******************************************************************************
474 
475  Device Boot-time Settings Routines
476 
477 *******************************************************************************/
478 
479 /* Boot time configuration table */
480 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481 
491 static int netdev_boot_setup_add(char *name, struct ifmap *map)
492 {
493  struct netdev_boot_setup *s;
494  int i;
495 
496  s = dev_boot_setup;
497  for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498  if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499  memset(s[i].name, 0, sizeof(s[i].name));
500  strlcpy(s[i].name, name, IFNAMSIZ);
501  memcpy(&s[i].map, map, sizeof(s[i].map));
502  break;
503  }
504  }
505 
506  return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507 }
508 
518 int netdev_boot_setup_check(struct net_device *dev)
519 {
520  struct netdev_boot_setup *s = dev_boot_setup;
521  int i;
522 
523  for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524  if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525  !strcmp(dev->name, s[i].name)) {
526  dev->irq = s[i].map.irq;
527  dev->base_addr = s[i].map.base_addr;
528  dev->mem_start = s[i].map.mem_start;
529  dev->mem_end = s[i].map.mem_end;
530  return 1;
531  }
532  }
533  return 0;
534 }
536 
537 
548 unsigned long netdev_boot_base(const char *prefix, int unit)
549 {
550  const struct netdev_boot_setup *s = dev_boot_setup;
551  char name[IFNAMSIZ];
552  int i;
553 
554  sprintf(name, "%s%d", prefix, unit);
555 
556  /*
557  * If device already registered then return base of 1
558  * to indicate not to probe for this interface
559  */
560  if (__dev_get_by_name(&init_net, name))
561  return 1;
562 
563  for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564  if (!strcmp(name, s[i].name))
565  return s[i].map.base_addr;
566  return 0;
567 }
568 
569 /*
570  * Saves at boot time configured settings for any netdevice.
571  */
572 int __init netdev_boot_setup(char *str)
573 {
574  int ints[5];
575  struct ifmap map;
576 
577  str = get_options(str, ARRAY_SIZE(ints), ints);
578  if (!str || !*str)
579  return 0;
580 
581  /* Save settings */
582  memset(&map, 0, sizeof(map));
583  if (ints[0] > 0)
584  map.irq = ints[1];
585  if (ints[0] > 1)
586  map.base_addr = ints[2];
587  if (ints[0] > 2)
588  map.mem_start = ints[3];
589  if (ints[0] > 3)
590  map.mem_end = ints[4];
591 
592  /* Add new entry to the list */
593  return netdev_boot_setup_add(str, &map);
594 }
595 
596 __setup("netdev=", netdev_boot_setup);
597 
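
/* Illustrative note (not part of dev.c): the __setup() handler above parses
 * a boot parameter of the form
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * e.g. "netdev=5,0x300,0,0,eth0".  get_options() consumes the leading
 * integers into ints[] and leaves the device name in str, which is then
 * stored by netdev_boot_setup_add() and later applied to the device in
 * netdev_boot_setup_check().
 */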
598 /*******************************************************************************
599 
600  Device Interface Subroutines
601 
602 *******************************************************************************/
603 
616 struct net_device *__dev_get_by_name(struct net *net, const char *name)
617 {
618  struct hlist_node *p;
619  struct net_device *dev;
620  struct hlist_head *head = dev_name_hash(net, name);
621 
622  hlist_for_each_entry(dev, p, head, name_hlist)
623  if (!strncmp(dev->name, name, IFNAMSIZ))
624  return dev;
625 
626  return NULL;
627 }
629 
642 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643 {
644  struct hlist_node *p;
645  struct net_device *dev;
646  struct hlist_head *head = dev_name_hash(net, name);
647 
648  hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649  if (!strncmp(dev->name, name, IFNAMSIZ))
650  return dev;
651 
652  return NULL;
653 }
655 
668 struct net_device *dev_get_by_name(struct net *net, const char *name)
669 {
670  struct net_device *dev;
671 
672  rcu_read_lock();
673  dev = dev_get_by_name_rcu(net, name);
674  if (dev)
675  dev_hold(dev);
676  rcu_read_unlock();
677  return dev;
678 }
680 
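
/* Illustrative sketch (not part of dev.c): the three lookup flavours above
 * differ only in locking and reference counting.  A caller that cannot stay
 * inside rcu_read_lock() or the RTNL for the whole use of the device takes
 * the refcounted variant and drops the reference with dev_put().
 */
#if 0	/* example only */
static int example_lookup(struct net *net)
{
	struct net_device *dev = dev_get_by_name(net, "eth0");

	if (!dev)
		return -ENODEV;
	pr_info("%s: ifindex %d mtu %u\n", dev->name, dev->ifindex, dev->mtu);
	dev_put(dev);		/* release the reference taken via dev_hold() */
	return 0;
}
#endif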
693 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694 {
695  struct hlist_node *p;
696  struct net_device *dev;
697  struct hlist_head *head = dev_index_hash(net, ifindex);
698 
699  hlist_for_each_entry(dev, p, head, index_hlist)
700  if (dev->ifindex == ifindex)
701  return dev;
702 
703  return NULL;
704 }
706 
718 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719 {
720  struct hlist_node *p;
721  struct net_device *dev;
722  struct hlist_head *head = dev_index_hash(net, ifindex);
723 
724  hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725  if (dev->ifindex == ifindex)
726  return dev;
727 
728  return NULL;
729 }
731 
732 
744 struct net_device *dev_get_by_index(struct net *net, int ifindex)
745 {
746  struct net_device *dev;
747 
748  rcu_read_lock();
749  dev = dev_get_by_index_rcu(net, ifindex);
750  if (dev)
751  dev_hold(dev);
752  rcu_read_unlock();
753  return dev;
754 }
756 
771 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772  const char *ha)
773 {
774  struct net_device *dev;
775 
776  for_each_netdev_rcu(net, dev)
777  if (dev->type == type &&
778  !memcmp(dev->dev_addr, ha, dev->addr_len))
779  return dev;
780 
781  return NULL;
782 }
784 
785 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786 {
787  struct net_device *dev;
788 
789  ASSERT_RTNL();
790  for_each_netdev(net, dev)
791  if (dev->type == type)
792  return dev;
793 
794  return NULL;
795 }
797 
798 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799 {
800  struct net_device *dev, *ret = NULL;
801 
802  rcu_read_lock();
803  for_each_netdev_rcu(net, dev)
804  if (dev->type == type) {
805  dev_hold(dev);
806  ret = dev;
807  break;
808  }
809  rcu_read_unlock();
810  return ret;
811 }
813 
825 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826  unsigned short mask)
827 {
828  struct net_device *dev, *ret;
829 
830  ret = NULL;
831  for_each_netdev_rcu(net, dev) {
832  if (((dev->flags ^ if_flags) & mask) == 0) {
833  ret = dev;
834  break;
835  }
836  }
837  return ret;
838 }
840 
849 bool dev_valid_name(const char *name)
850 {
851  if (*name == '\0')
852  return false;
853  if (strlen(name) >= IFNAMSIZ)
854  return false;
855  if (!strcmp(name, ".") || !strcmp(name, ".."))
856  return false;
857 
858  while (*name) {
859  if (*name == '/' || isspace(*name))
860  return false;
861  name++;
862  }
863  return true;
864 }
866 
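
/* Illustrative note (not part of dev.c): "eth0", "br-lan" and templates such
 * as "wlan%d" pass the checks above; the empty string, ".", "..", anything
 * containing '/' or whitespace, and names of IFNAMSIZ (16) characters or
 * more are rejected.
 */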
882 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883 {
884  int i = 0;
885  const char *p;
886  const int max_netdevices = 8*PAGE_SIZE;
887  unsigned long *inuse;
888  struct net_device *d;
889 
890  p = strnchr(name, IFNAMSIZ-1, '%');
891  if (p) {
892  /*
893  * Verify the string as this thing may have come from
894  * the user. There must be either one "%d" and no other "%"
895  * characters.
896  */
897  if (p[1] != 'd' || strchr(p + 2, '%'))
898  return -EINVAL;
899 
900  /* Use one page as a bit array of possible slots */
901  inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902  if (!inuse)
903  return -ENOMEM;
904 
905  for_each_netdev(net, d) {
906  if (!sscanf(d->name, name, &i))
907  continue;
908  if (i < 0 || i >= max_netdevices)
909  continue;
910 
911  /* avoid cases where sscanf is not exact inverse of printf */
912  snprintf(buf, IFNAMSIZ, name, i);
913  if (!strncmp(buf, d->name, IFNAMSIZ))
914  set_bit(i, inuse);
915  }
916 
917  i = find_first_zero_bit(inuse, max_netdevices);
918  free_page((unsigned long) inuse);
919  }
920 
921  if (buf != name)
922  snprintf(buf, IFNAMSIZ, name, i);
923  if (!__dev_get_by_name(net, buf))
924  return i;
925 
926  /* It is possible to run out of possible slots
927  * when the name is long and there isn't enough space left
928  * for the digits, or if all bits are used.
929  */
930  return -ENFILE;
931 }
932 
947 int dev_alloc_name(struct net_device *dev, const char *name)
948 {
949  char buf[IFNAMSIZ];
950  struct net *net;
951  int ret;
952 
953  BUG_ON(!dev_net(dev));
954  net = dev_net(dev);
955  ret = __dev_alloc_name(net, name, buf);
956  if (ret >= 0)
957  strlcpy(dev->name, buf, IFNAMSIZ);
958  return ret;
959 }
961 
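
/* Illustrative sketch (not part of dev.c): a driver that wants an
 * automatically numbered name passes a "%d" template.  If eth0 and eth1
 * already exist in the namespace, the bitmap scan in __dev_alloc_name()
 * picks unit 2, so the call below sets dev->name to "eth2" and returns 2.
 */
#if 0	/* example only: dev and err belong to the enclosing driver code */
	err = dev_alloc_name(dev, "eth%d");
	if (err < 0)
		return err;
#endif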
962 static int dev_alloc_name_ns(struct net *net,
963  struct net_device *dev,
964  const char *name)
965 {
966  char buf[IFNAMSIZ];
967  int ret;
968 
969  ret = __dev_alloc_name(net, name, buf);
970  if (ret >= 0)
971  strlcpy(dev->name, buf, IFNAMSIZ);
972  return ret;
973 }
974 
975 static int dev_get_valid_name(struct net *net,
976  struct net_device *dev,
977  const char *name)
978 {
979  BUG_ON(!net);
980 
981  if (!dev_valid_name(name))
982  return -EINVAL;
983 
984  if (strchr(name, '%'))
985  return dev_alloc_name_ns(net, dev, name);
986  else if (__dev_get_by_name(net, name))
987  return -EEXIST;
988  else if (dev->name != name)
989  strlcpy(dev->name, name, IFNAMSIZ);
990 
991  return 0;
992 }
993 
1002 int dev_change_name(struct net_device *dev, const char *newname)
1003 {
1004  char oldname[IFNAMSIZ];
1005  int err = 0;
1006  int ret;
1007  struct net *net;
1008 
1009  ASSERT_RTNL();
1010  BUG_ON(!dev_net(dev));
1011 
1012  net = dev_net(dev);
1013  if (dev->flags & IFF_UP)
1014  return -EBUSY;
1015 
1016  if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1017  return 0;
1018 
1019  memcpy(oldname, dev->name, IFNAMSIZ);
1020 
1021  err = dev_get_valid_name(net, dev, newname);
1022  if (err < 0)
1023  return err;
1024 
1025 rollback:
1026  ret = device_rename(&dev->dev, dev->name);
1027  if (ret) {
1028  memcpy(dev->name, oldname, IFNAMSIZ);
1029  return ret;
1030  }
1031 
1032  write_lock_bh(&dev_base_lock);
1033  hlist_del_rcu(&dev->name_hlist);
1034  write_unlock_bh(&dev_base_lock);
1035 
1036  synchronize_rcu();
1037 
1038  write_lock_bh(&dev_base_lock);
1039  hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1040  write_unlock_bh(&dev_base_lock);
1041 
1042  ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1043  ret = notifier_to_errno(ret);
1044 
1045  if (ret) {
1046  /* err >= 0 after dev_alloc_name() or stores the first errno */
1047  if (err >= 0) {
1048  err = ret;
1049  memcpy(dev->name, oldname, IFNAMSIZ);
1050  goto rollback;
1051  } else {
1052  pr_err("%s: name change rollback failed: %d\n",
1053  dev->name, ret);
1054  }
1055  }
1056 
1057  return err;
1058 }
1059 
1068 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1069 {
1070  char *new_ifalias;
1071 
1072  ASSERT_RTNL();
1073 
1074  if (len >= IFALIASZ)
1075  return -EINVAL;
1076 
1077  if (!len) {
1078  if (dev->ifalias) {
1079  kfree(dev->ifalias);
1080  dev->ifalias = NULL;
1081  }
1082  return 0;
1083  }
1084 
1085  new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1086  if (!new_ifalias)
1087  return -ENOMEM;
1088  dev->ifalias = new_ifalias;
1089 
1090  strlcpy(dev->ifalias, alias, len+1);
1091  return len;
1092 }
1093 
1094 
1101 void netdev_features_change(struct net_device *dev)
1102 {
1103  call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1104 }
1105 EXPORT_SYMBOL(netdev_features_change);
1106 
1115 void netdev_state_change(struct net_device *dev)
1116 {
1117  if (dev->flags & IFF_UP) {
1118  call_netdevice_notifiers(NETDEV_CHANGE, dev);
1119  rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1120  }
1121 }
1123 
1134 void netdev_notify_peers(struct net_device *dev)
1135 {
1136  rtnl_lock();
1137  call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1138  rtnl_unlock();
1139 }
1141 
1152 void dev_load(struct net *net, const char *name)
1153 {
1154  struct net_device *dev;
1155  int no_module;
1156 
1157  rcu_read_lock();
1158  dev = dev_get_by_name_rcu(net, name);
1159  rcu_read_unlock();
1160 
1161  no_module = !dev;
1162  if (no_module && capable(CAP_NET_ADMIN))
1163  no_module = request_module("netdev-%s", name);
1164  if (no_module && capable(CAP_SYS_MODULE)) {
1165  if (!request_module("%s", name))
1166  pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1167  name);
1168  }
1169 }
1170 EXPORT_SYMBOL(dev_load);
1171 
1172 static int __dev_open(struct net_device *dev)
1173 {
1174  const struct net_device_ops *ops = dev->netdev_ops;
1175  int ret;
1176 
1177  ASSERT_RTNL();
1178 
1179  if (!netif_device_present(dev))
1180  return -ENODEV;
1181 
1182  ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1183  ret = notifier_to_errno(ret);
1184  if (ret)
1185  return ret;
1186 
1187  set_bit(__LINK_STATE_START, &dev->state);
1188 
1189  if (ops->ndo_validate_addr)
1190  ret = ops->ndo_validate_addr(dev);
1191 
1192  if (!ret && ops->ndo_open)
1193  ret = ops->ndo_open(dev);
1194 
1195  if (ret)
1196  clear_bit(__LINK_STATE_START, &dev->state);
1197  else {
1198  dev->flags |= IFF_UP;
1199  net_dmaengine_get();
1200  dev_set_rx_mode(dev);
1201  dev_activate(dev);
1203  }
1204 
1205  return ret;
1206 }
1207 
1220 int dev_open(struct net_device *dev)
1221 {
1222  int ret;
1223 
1224  if (dev->flags & IFF_UP)
1225  return 0;
1226 
1227  ret = __dev_open(dev);
1228  if (ret < 0)
1229  return ret;
1230 
1231  rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1232  call_netdevice_notifiers(NETDEV_UP, dev);
1233 
1234  return ret;
1235 }
1236 EXPORT_SYMBOL(dev_open);
1237 
1238 static int __dev_close_many(struct list_head *head)
1239 {
1240  struct net_device *dev;
1241 
1242  ASSERT_RTNL();
1243  might_sleep();
1244 
1245  list_for_each_entry(dev, head, unreg_list) {
1246  call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1247 
1248  clear_bit(__LINK_STATE_START, &dev->state);
1249 
1250  /* Synchronize to scheduled poll. We cannot touch poll list, it
1251  * can be even on different cpu. So just clear netif_running().
1252  *
1253  * dev->stop() will invoke napi_disable() on all of its
1254  * napi_struct instances on this device.
1255  */
1256  smp_mb__after_clear_bit(); /* Commit netif_running(). */
1257  }
1258 
1259  dev_deactivate_many(head);
1260 
1261  list_for_each_entry(dev, head, unreg_list) {
1262  const struct net_device_ops *ops = dev->netdev_ops;
1263 
1264  /*
1265  * Call the device specific close. This cannot fail.
1266  * Only if device is UP
1267  *
1268  * We allow it to be called even after a DETACH hot-plug
1269  * event.
1270  */
1271  if (ops->ndo_stop)
1272  ops->ndo_stop(dev);
1273 
1274  dev->flags &= ~IFF_UP;
1275  net_dmaengine_put();
1276  }
1277 
1278  return 0;
1279 }
1280 
1281 static int __dev_close(struct net_device *dev)
1282 {
1283  int retval;
1284  LIST_HEAD(single);
1285 
1286  list_add(&dev->unreg_list, &single);
1287  retval = __dev_close_many(&single);
1288  list_del(&single);
1289  return retval;
1290 }
1291 
1292 static int dev_close_many(struct list_head *head)
1293 {
1294  struct net_device *dev, *tmp;
1295  LIST_HEAD(tmp_list);
1296 
1297  list_for_each_entry_safe(dev, tmp, head, unreg_list)
1298  if (!(dev->flags & IFF_UP))
1299  list_move(&dev->unreg_list, &tmp_list);
1300 
1301  __dev_close_many(head);
1302 
1303  list_for_each_entry(dev, head, unreg_list) {
1304  rtmsg_ifinfo(RTM_DELLINK, dev, IFF_UP|IFF_RUNNING);
1305  call_netdevice_notifiers(NETDEV_DOWN, dev);
1306  }
1307 
1308  /* rollback_registered_many needs the complete original list */
1309  list_splice(&tmp_list, head);
1310  return 0;
1311 }
1312 
1322 int dev_close(struct net_device *dev)
1323 {
1324  if (dev->flags & IFF_UP) {
1325  LIST_HEAD(single);
1326 
1327  list_add(&dev->unreg_list, &single);
1328  dev_close_many(&single);
1329  list_del(&single);
1330  }
1331  return 0;
1332 }
1333 EXPORT_SYMBOL(dev_close);
1334 
1335 
1344 void dev_disable_lro(struct net_device *dev)
1345 {
1346  /*
1347  * If we're trying to disable lro on a vlan device
1348  * use the underlying physical device instead
1349  */
1350  if (is_vlan_dev(dev))
1351  dev = vlan_dev_real_dev(dev);
1352 
1353  dev->wanted_features &= ~NETIF_F_LRO;
1354  netdev_update_features(dev);
1355 
1356  if (unlikely(dev->features & NETIF_F_LRO))
1357  netdev_WARN(dev, "failed to disable LRO!\n");
1358 }
1360 
1361 
1362 static int dev_boot_phase = 1;
1363 
1378 int register_netdevice_notifier(struct notifier_block *nb)
1379 {
1380  struct net_device *dev;
1381  struct net_device *last;
1382  struct net *net;
1383  int err;
1384 
1385  rtnl_lock();
1386  err = raw_notifier_chain_register(&netdev_chain, nb);
1387  if (err)
1388  goto unlock;
1389  if (dev_boot_phase)
1390  goto unlock;
1391  for_each_net(net) {
1392  for_each_netdev(net, dev) {
1393  err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1394  err = notifier_to_errno(err);
1395  if (err)
1396  goto rollback;
1397 
1398  if (!(dev->flags & IFF_UP))
1399  continue;
1400 
1401  nb->notifier_call(nb, NETDEV_UP, dev);
1402  }
1403  }
1404 
1405 unlock:
1406  rtnl_unlock();
1407  return err;
1408 
1409 rollback:
1410  last = dev;
1411  for_each_net(net) {
1412  for_each_netdev(net, dev) {
1413  if (dev == last)
1414  goto outroll;
1415 
1416  if (dev->flags & IFF_UP) {
1417  nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1418  nb->notifier_call(nb, NETDEV_DOWN, dev);
1419  }
1420  nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1421  }
1422  }
1423 
1424 outroll:
1425  raw_notifier_chain_unregister(&netdev_chain, nb);
1426  goto unlock;
1427 }
1429 
1444 int unregister_netdevice_notifier(struct notifier_block *nb)
1445 {
1446  struct net_device *dev;
1447  struct net *net;
1448  int err;
1449 
1450  rtnl_lock();
1451  err = raw_notifier_chain_unregister(&netdev_chain, nb);
1452  if (err)
1453  goto unlock;
1454 
1455  for_each_net(net) {
1456  for_each_netdev(net, dev) {
1457  if (dev->flags & IFF_UP) {
1458  nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1459  nb->notifier_call(nb, NETDEV_DOWN, dev);
1460  }
1461  nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1462  }
1463  }
1464 unlock:
1465  rtnl_unlock();
1466  return err;
1467 }
1469 
1479 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1480 {
1481  ASSERT_RTNL();
1482  return raw_notifier_call_chain(&netdev_chain, val, dev);
1483 }
1485 
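
/* Illustrative sketch (not part of dev.c): a typical consumer of the
 * notifier chain above.  The callback and variable names are hypothetical;
 * in this kernel the notifier payload is the struct net_device itself.
 */
#if 0	/* example only */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP)
		pr_info("%s is up\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_notifier) in module init,
 * unregister_netdevice_notifier(&example_netdev_notifier) in module exit. */
#endif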
1486 static struct static_key netstamp_needed __read_mostly;
1487 #ifdef HAVE_JUMP_LABEL
1488 /* We are not allowed to call static_key_slow_dec() from irq context
1489  * If net_disable_timestamp() is called from irq context, defer the
1490  * static_key_slow_dec() calls.
1491  */
1492 static atomic_t netstamp_needed_deferred;
1493 #endif
1494 
1495 void net_enable_timestamp(void)
1496 {
1497 #ifdef HAVE_JUMP_LABEL
1498  int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1499 
1500  if (deferred) {
1501  while (--deferred)
1502  static_key_slow_dec(&netstamp_needed);
1503  return;
1504  }
1505 #endif
1506  WARN_ON(in_interrupt());
1507  static_key_slow_inc(&netstamp_needed);
1508 }
1510 
1511 void net_disable_timestamp(void)
1512 {
1513 #ifdef HAVE_JUMP_LABEL
1514  if (in_interrupt()) {
1515  atomic_inc(&netstamp_needed_deferred);
1516  return;
1517  }
1518 #endif
1519  static_key_slow_dec(&netstamp_needed);
1520 }
1522 
1523 static inline void net_timestamp_set(struct sk_buff *skb)
1524 {
1525  skb->tstamp.tv64 = 0;
1526  if (static_key_false(&netstamp_needed))
1527  __net_timestamp(skb);
1528 }
1529 
1530 #define net_timestamp_check(COND, SKB) \
1531  if (static_key_false(&netstamp_needed)) { \
1532  if ((COND) && !(SKB)->tstamp.tv64) \
1533  __net_timestamp(SKB); \
1534  } \
1535 
1536 static int net_hwtstamp_validate(struct ifreq *ifr)
1537 {
1538  struct hwtstamp_config cfg;
1539  enum hwtstamp_tx_types tx_type;
1540  enum hwtstamp_rx_filters rx_filter;
1541  int tx_type_valid = 0;
1542  int rx_filter_valid = 0;
1543 
1544  if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1545  return -EFAULT;
1546 
1547  if (cfg.flags) /* reserved for future extensions */
1548  return -EINVAL;
1549 
1550  tx_type = cfg.tx_type;
1551  rx_filter = cfg.rx_filter;
1552 
1553  switch (tx_type) {
1554  case HWTSTAMP_TX_OFF:
1555  case HWTSTAMP_TX_ON:
1556  case HWTSTAMP_TX_ONESTEP_SYNC:
1557  tx_type_valid = 1;
1558  break;
1559  }
1560 
1561  switch (rx_filter) {
1562  case HWTSTAMP_FILTER_NONE:
1563  case HWTSTAMP_FILTER_ALL:
1564  case HWTSTAMP_FILTER_SOME:
1565  case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1566  case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1567  case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1568  case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1569  case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1570  case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1571  case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1572  case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1573  case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1574  case HWTSTAMP_FILTER_PTP_V2_EVENT:
1575  case HWTSTAMP_FILTER_PTP_V2_SYNC:
1576  case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1577  rx_filter_valid = 1;
1578  break;
1579  }
1580 
1581  if (!tx_type_valid || !rx_filter_valid)
1582  return -ERANGE;
1583 
1584  return 0;
1585 }
1586 
1587 static inline bool is_skb_forwardable(struct net_device *dev,
1588  struct sk_buff *skb)
1589 {
1590  unsigned int len;
1591 
1592  if (!(dev->flags & IFF_UP))
1593  return false;
1594 
1595  len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1596  if (skb->len <= len)
1597  return true;
1598 
1599  /* if TSO is enabled, we don't care about the length as the packet
1600  * could be forwarded without being segmented before
1601  */
1602  if (skb_is_gso(skb))
1603  return true;
1604 
1605  return false;
1606 }
1607 
1626 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1627 {
1628  if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1629  if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1630  atomic_long_inc(&dev->rx_dropped);
1631  kfree_skb(skb);
1632  return NET_RX_DROP;
1633  }
1634  }
1635 
1636  skb_orphan(skb);
1637  nf_reset(skb);
1638 
1639  if (unlikely(!is_skb_forwardable(dev, skb))) {
1640  atomic_long_inc(&dev->rx_dropped);
1641  kfree_skb(skb);
1642  return NET_RX_DROP;
1643  }
1644  skb->skb_iif = 0;
1645  skb->dev = dev;
1646  skb_dst_drop(skb);
1647  skb->tstamp.tv64 = 0;
1648  skb->pkt_type = PACKET_HOST;
1649  skb->protocol = eth_type_trans(skb, dev);
1650  skb->mark = 0;
1651  secpath_reset(skb);
1652  nf_reset(skb);
1653  return netif_rx(skb);
1654 }
1656 
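
/* Illustrative sketch (not part of dev.c): dev_forward_skb() is what a
 * software device pair (veth-style) uses to inject a transmitted frame into
 * its peer's receive path.  "peer" below is a hypothetical net_device the
 * caller already holds; the skb is consumed either way.
 */
#if 0	/* example only */
	if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
		pr_debug("forward to %s dropped\n", peer->name);
#endif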
1657 static inline int deliver_skb(struct sk_buff *skb,
1658  struct packet_type *pt_prev,
1659  struct net_device *orig_dev)
1660 {
1661  if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1662  return -ENOMEM;
1663  atomic_inc(&skb->users);
1664  return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1665 }
1666 
1667 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1668 {
1669  if (!ptype->af_packet_priv || !skb->sk)
1670  return false;
1671 
1672  if (ptype->id_match)
1673  return ptype->id_match(ptype, skb->sk);
1674  else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1675  return true;
1676 
1677  return false;
1678 }
1679 
1680 /*
1681  * Support routine. Sends outgoing frames to any network
1682  * taps currently in use.
1683  */
1684 
1685 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1686 {
1687  struct packet_type *ptype;
1688  struct sk_buff *skb2 = NULL;
1689  struct packet_type *pt_prev = NULL;
1690 
1691  rcu_read_lock();
1692  list_for_each_entry_rcu(ptype, &ptype_all, list) {
1693  /* Never send packets back to the socket
1694  * they originated from - MvS ([email protected])
1695  */
1696  if ((ptype->dev == dev || !ptype->dev) &&
1697  (!skb_loop_sk(ptype, skb))) {
1698  if (pt_prev) {
1699  deliver_skb(skb2, pt_prev, skb->dev);
1700  pt_prev = ptype;
1701  continue;
1702  }
1703 
1704  skb2 = skb_clone(skb, GFP_ATOMIC);
1705  if (!skb2)
1706  break;
1707 
1708  net_timestamp_set(skb2);
1709 
1710  /* skb->nh should be correctly
1711  set by sender, so that the second statement is
1712  just protection against buggy protocols.
1713  */
1714  skb_reset_mac_header(skb2);
1715 
1716  if (skb_network_header(skb2) < skb2->data ||
1717  skb2->network_header > skb2->tail) {
1718  net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1719  ntohs(skb2->protocol),
1720  dev->name);
1721  skb_reset_network_header(skb2);
1722  }
1723 
1724  skb2->transport_header = skb2->network_header;
1725  skb2->pkt_type = PACKET_OUTGOING;
1726  pt_prev = ptype;
1727  }
1728  }
1729  if (pt_prev)
1730  pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1731  rcu_read_unlock();
1732 }
1733 
1747 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1748 {
1749  int i;
1750  struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1751 
1752  /* If TC0 is invalidated disable TC mapping */
1753  if (tc->offset + tc->count > txq) {
1754  pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1755  dev->num_tc = 0;
1756  return;
1757  }
1758 
1759  /* Invalidated prio to tc mappings set to TC0 */
1760  for (i = 1; i < TC_BITMASK + 1; i++) {
1761  int q = netdev_get_prio_tc_map(dev, i);
1762 
1763  tc = &dev->tc_to_txq[q];
1764  if (tc->offset + tc->count > txq) {
1765  pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1766  i, q);
1767  netdev_set_prio_tc_map(dev, i, 0);
1768  }
1769  }
1770 }
1771 
1772 /*
1773  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1774  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
1775  */
1776 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1777 {
1778  int rc;
1779 
1780  if (txq < 1 || txq > dev->num_tx_queues)
1781  return -EINVAL;
1782 
1783  if (dev->reg_state == NETREG_REGISTERED ||
1784  dev->reg_state == NETREG_UNREGISTERING) {
1785  ASSERT_RTNL();
1786 
1787  rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1788  txq);
1789  if (rc)
1790  return rc;
1791 
1792  if (dev->num_tc)
1793  netif_setup_tc(dev, txq);
1794 
1795  if (txq < dev->real_num_tx_queues)
1796  qdisc_reset_all_tx_gt(dev, txq);
1797  }
1798 
1799  dev->real_num_tx_queues = txq;
1800  return 0;
1801 }
1803 
1804 #ifdef CONFIG_RPS
1805 
1815 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1816 {
1817  int rc;
1818 
1819  if (rxq < 1 || rxq > dev->num_rx_queues)
1820  return -EINVAL;
1821 
1822  if (dev->reg_state == NETREG_REGISTERED) {
1823  ASSERT_RTNL();
1824 
1825  rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1826  rxq);
1827  if (rc)
1828  return rc;
1829  }
1830 
1831  dev->real_num_rx_queues = rxq;
1832  return 0;
1833 }
1834 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1835 #endif
1836 
1843 int netif_get_num_default_rss_queues(void)
1844 {
1845  return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1846 }
1848 
1849 static inline void __netif_reschedule(struct Qdisc *q)
1850 {
1851  struct softnet_data *sd;
1852  unsigned long flags;
1853 
1854  local_irq_save(flags);
1855  sd = &__get_cpu_var(softnet_data);
1856  q->next_sched = NULL;
1857  *sd->output_queue_tailp = q;
1858  sd->output_queue_tailp = &q->next_sched;
1859  raise_softirq_irqoff(NET_TX_SOFTIRQ);
1860  local_irq_restore(flags);
1861 }
1862 
1863 void __netif_schedule(struct Qdisc *q)
1864 {
1865  if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1866  __netif_reschedule(q);
1867 }
1869 
1870 void dev_kfree_skb_irq(struct sk_buff *skb)
1871 {
1872  if (atomic_dec_and_test(&skb->users)) {
1873  struct softnet_data *sd;
1874  unsigned long flags;
1875 
1876  local_irq_save(flags);
1877  sd = &__get_cpu_var(softnet_data);
1878  skb->next = sd->completion_queue;
1879  sd->completion_queue = skb;
1880  raise_softirq_irqoff(NET_TX_SOFTIRQ);
1881  local_irq_restore(flags);
1882  }
1883 }
1885 
1886 void dev_kfree_skb_any(struct sk_buff *skb)
1887 {
1888  if (in_irq() || irqs_disabled())
1889  dev_kfree_skb_irq(skb);
1890  else
1891  dev_kfree_skb(skb);
1892 }
1894 
1895 
1902 void netif_device_detach(struct net_device *dev)
1903 {
1904  if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1905  netif_running(dev)) {
1906  netif_tx_stop_all_queues(dev);
1907  }
1908 }
1910 
1917 void netif_device_attach(struct net_device *dev)
1918 {
1919  if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1920  netif_running(dev)) {
1921  netif_tx_wake_all_queues(dev);
1922  __netdev_watchdog_up(dev);
1923  }
1924 }
1926 
1927 static void skb_warn_bad_offload(const struct sk_buff *skb)
1928 {
1929  static const netdev_features_t null_features = 0;
1930  struct net_device *dev = skb->dev;
1931  const char *driver = "";
1932 
1933  if (dev && dev->dev.parent)
1934  driver = dev_driver_string(dev->dev.parent);
1935 
1936  WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1937  "gso_type=%d ip_summed=%d\n",
1938  driver, dev ? &dev->features : &null_features,
1939  skb->sk ? &skb->sk->sk_route_caps : &null_features,
1940  skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1941  skb_shinfo(skb)->gso_type, skb->ip_summed);
1942 }
1943 
1944 /*
1945  * Invalidate hardware checksum when packet is to be mangled, and
1946  * complete checksum manually on outgoing path.
1947  */
1948 int skb_checksum_help(struct sk_buff *skb)
1949 {
1950  __wsum csum;
1951  int ret = 0, offset;
1952 
1953  if (skb->ip_summed == CHECKSUM_COMPLETE)
1954  goto out_set_summed;
1955 
1956  if (unlikely(skb_shinfo(skb)->gso_size)) {
1957  skb_warn_bad_offload(skb);
1958  return -EINVAL;
1959  }
1960 
1961  offset = skb_checksum_start_offset(skb);
1962  BUG_ON(offset >= skb_headlen(skb));
1963  csum = skb_checksum(skb, offset, skb->len - offset, 0);
1964 
1965  offset += skb->csum_offset;
1966  BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1967 
1968  if (skb_cloned(skb) &&
1969  !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1970  ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1971  if (ret)
1972  goto out;
1973  }
1974 
1975  *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1976 out_set_summed:
1977  skb->ip_summed = CHECKSUM_NONE;
1978 out:
1979  return ret;
1980 }
1982 
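
/* Illustrative note (not part of dev.c): the transmit path below calls
 * skb_checksum_help() when skb->ip_summed == CHECKSUM_PARTIAL but the device
 * offers no suitable NETIF_F_*_CSUM feature for the packet, i.e. the
 * checksum the hardware was expected to fill in is computed in software
 * instead (see dev_hard_start_xmit()).
 */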
1993 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1994  netdev_features_t features)
1995 {
1996  struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1997  struct packet_type *ptype;
1998  __be16 type = skb->protocol;
1999  int vlan_depth = ETH_HLEN;
2000  int err;
2001 
2002  while (type == htons(ETH_P_8021Q)) {
2003  struct vlan_hdr *vh;
2004 
2005  if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2006  return ERR_PTR(-EINVAL);
2007 
2008  vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2009  type = vh->h_vlan_encapsulated_proto;
2010  vlan_depth += VLAN_HLEN;
2011  }
2012 
2013  skb_reset_mac_header(skb);
2014  skb->mac_len = skb->network_header - skb->mac_header;
2015  __skb_pull(skb, skb->mac_len);
2016 
2017  if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2018  skb_warn_bad_offload(skb);
2019 
2020  if (skb_header_cloned(skb) &&
2021  (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2022  return ERR_PTR(err);
2023  }
2024 
2025  rcu_read_lock();
2026  list_for_each_entry_rcu(ptype,
2027  &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2028  if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
2029  if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2030  err = ptype->gso_send_check(skb);
2031  segs = ERR_PTR(err);
2032  if (err || skb_gso_ok(skb, features))
2033  break;
2034  __skb_push(skb, (skb->data -
2035  skb_network_header(skb)));
2036  }
2037  segs = ptype->gso_segment(skb, features);
2038  break;
2039  }
2040  }
2041  rcu_read_unlock();
2042 
2043  __skb_push(skb, skb->data - skb_mac_header(skb));
2044 
2045  return segs;
2046 }
2048 
2049 /* Take action when hardware reception checksum errors are detected. */
2050 #ifdef CONFIG_BUG
2051 void netdev_rx_csum_fault(struct net_device *dev)
2052 {
2053  if (net_ratelimit()) {
2054  pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2055  dump_stack();
2056  }
2057 }
2058 EXPORT_SYMBOL(netdev_rx_csum_fault);
2059 #endif
2060 
2061 /* Actually, we should eliminate this check as soon as we know, that:
2062  * 1. IOMMU is present and allows to map all the memory.
2063  * 2. No high memory really exists on this machine.
2064  */
2065 
2066 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2067 {
2068 #ifdef CONFIG_HIGHMEM
2069  int i;
2070  if (!(dev->features & NETIF_F_HIGHDMA)) {
2071  for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2072  skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2073  if (PageHighMem(skb_frag_page(frag)))
2074  return 1;
2075  }
2076  }
2077 
2078  if (PCI_DMA_BUS_IS_PHYS) {
2079  struct device *pdev = dev->dev.parent;
2080 
2081  if (!pdev)
2082  return 0;
2083  for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2084  skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2085  dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2086  if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2087  return 1;
2088  }
2089  }
2090 #endif
2091  return 0;
2092 }
2093 
2094 struct dev_gso_cb {
2095  void (*destructor)(struct sk_buff *skb);
2096 };
2097 
2098 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2099 
2100 static void dev_gso_skb_destructor(struct sk_buff *skb)
2101 {
2102  struct dev_gso_cb *cb;
2103 
2104  do {
2105  struct sk_buff *nskb = skb->next;
2106 
2107  skb->next = nskb->next;
2108  nskb->next = NULL;
2109  kfree_skb(nskb);
2110  } while (skb->next);
2111 
2112  cb = DEV_GSO_CB(skb);
2113  if (cb->destructor)
2114  cb->destructor(skb);
2115 }
2116 
2125 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2126 {
2127  struct sk_buff *segs;
2128 
2129  segs = skb_gso_segment(skb, features);
2130 
2131  /* Verifying header integrity only. */
2132  if (!segs)
2133  return 0;
2134 
2135  if (IS_ERR(segs))
2136  return PTR_ERR(segs);
2137 
2138  skb->next = segs;
2139  DEV_GSO_CB(skb)->destructor = skb->destructor;
2140  skb->destructor = dev_gso_skb_destructor;
2141 
2142  return 0;
2143 }
2144 
2145 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2146 {
2147  return ((features & NETIF_F_GEN_CSUM) ||
2148  ((features & NETIF_F_V4_CSUM) &&
2149  protocol == htons(ETH_P_IP)) ||
2150  ((features & NETIF_F_V6_CSUM) &&
2151  protocol == htons(ETH_P_IPV6)) ||
2152  ((features & NETIF_F_FCOE_CRC) &&
2153  protocol == htons(ETH_P_FCOE)));
2154 }
2155 
2156 static netdev_features_t harmonize_features(struct sk_buff *skb,
2157  __be16 protocol, netdev_features_t features)
2158 {
2159  if (skb->ip_summed != CHECKSUM_NONE &&
2160  !can_checksum_protocol(features, protocol)) {
2161  features &= ~NETIF_F_ALL_CSUM;
2162  features &= ~NETIF_F_SG;
2163  } else if (illegal_highdma(skb->dev, skb)) {
2164  features &= ~NETIF_F_SG;
2165  }
2166 
2167  return features;
2168 }
2169 
2170 netdev_features_t netif_skb_features(struct sk_buff *skb)
2171 {
2172  __be16 protocol = skb->protocol;
2173  netdev_features_t features = skb->dev->features;
2174 
2175  if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2176  features &= ~NETIF_F_GSO_MASK;
2177 
2178  if (protocol == htons(ETH_P_8021Q)) {
2179  struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2180  protocol = veh->h_vlan_encapsulated_proto;
2181  } else if (!vlan_tx_tag_present(skb)) {
2182  return harmonize_features(skb, protocol, features);
2183  }
2184 
2185  features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2186 
2187  if (protocol != htons(ETH_P_8021Q)) {
2188  return harmonize_features(skb, protocol, features);
2189  } else {
2190  features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2191  NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2192  return harmonize_features(skb, protocol, features);
2193  }
2194 }
2196 
2197 /*
2198  * Returns true if either:
2199  * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2200  * 2. skb is fragmented and the device does not support SG.
2201  */
2202 static inline int skb_needs_linearize(struct sk_buff *skb,
2203  int features)
2204 {
2205  return skb_is_nonlinear(skb) &&
2206  ((skb_has_frag_list(skb) &&
2207  !(features & NETIF_F_FRAGLIST)) ||
2208  (skb_shinfo(skb)->nr_frags &&
2209  !(features & NETIF_F_SG)));
2210 }
2211 
2212 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2213  struct netdev_queue *txq)
2214 {
2215  const struct net_device_ops *ops = dev->netdev_ops;
2216  int rc = NETDEV_TX_OK;
2217  unsigned int skb_len;
2218 
2219  if (likely(!skb->next)) {
2220  netdev_features_t features;
2221 
2222  /*
2223  * If device doesn't need skb->dst, release it right now while
2224  * its hot in this cpu cache
2225  */
2226  if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2227  skb_dst_drop(skb);
2228 
2229  features = netif_skb_features(skb);
2230 
2231  if (vlan_tx_tag_present(skb) &&
2232  !(features & NETIF_F_HW_VLAN_TX)) {
2233  skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2234  if (unlikely(!skb))
2235  goto out;
2236 
2237  skb->vlan_tci = 0;
2238  }
2239 
2240  if (netif_needs_gso(skb, features)) {
2241  if (unlikely(dev_gso_segment(skb, features)))
2242  goto out_kfree_skb;
2243  if (skb->next)
2244  goto gso;
2245  } else {
2246  if (skb_needs_linearize(skb, features) &&
2247  __skb_linearize(skb))
2248  goto out_kfree_skb;
2249 
2250  /* If packet is not checksummed and device does not
2251  * support checksumming for this protocol, complete
2252  * checksumming here.
2253  */
2254  if (skb->ip_summed == CHECKSUM_PARTIAL) {
2255  skb_set_transport_header(skb,
2256  skb_checksum_start_offset(skb));
2257  if (!(features & NETIF_F_ALL_CSUM) &&
2258  skb_checksum_help(skb))
2259  goto out_kfree_skb;
2260  }
2261  }
2262 
2263  if (!list_empty(&ptype_all))
2264  dev_queue_xmit_nit(skb, dev);
2265 
2266  skb_len = skb->len;
2267  rc = ops->ndo_start_xmit(skb, dev);
2268  trace_net_dev_xmit(skb, rc, dev, skb_len);
2269  if (rc == NETDEV_TX_OK)
2270  txq_trans_update(txq);
2271  return rc;
2272  }
2273 
2274 gso:
2275  do {
2276  struct sk_buff *nskb = skb->next;
2277 
2278  skb->next = nskb->next;
2279  nskb->next = NULL;
2280 
2281  /*
2282  * If device doesn't need nskb->dst, release it right now while
2283  * its hot in this cpu cache
2284  * it's hot in this cpu cache
2285  if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2286  skb_dst_drop(nskb);
2287 
2288  if (!list_empty(&ptype_all))
2289  dev_queue_xmit_nit(nskb, dev);
2290 
2291  skb_len = nskb->len;
2292  rc = ops->ndo_start_xmit(nskb, dev);
2293  trace_net_dev_xmit(nskb, rc, dev, skb_len);
2294  if (unlikely(rc != NETDEV_TX_OK)) {
2295  if (rc & ~NETDEV_TX_MASK)
2296  goto out_kfree_gso_skb;
2297  nskb->next = skb->next;
2298  skb->next = nskb;
2299  return rc;
2300  }
2301  txq_trans_update(txq);
2302  if (unlikely(netif_xmit_stopped(txq) && skb->next))
2303  return NETDEV_TX_BUSY;
2304  } while (skb->next);
2305 
2306 out_kfree_gso_skb:
2307  if (likely(skb->next == NULL))
2308  skb->destructor = DEV_GSO_CB(skb)->destructor;
2309 out_kfree_skb:
2310  kfree_skb(skb);
2311 out:
2312  return rc;
2313 }
2314 
2315 static u32 hashrnd __read_mostly;
2316 
2317 /*
2318  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2319  * to be used as a distribution range.
2320  */
2321 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2322  unsigned int num_tx_queues)
2323 {
2324  u32 hash;
2325  u16 qoffset = 0;
2326  u16 qcount = num_tx_queues;
2327 
2328  if (skb_rx_queue_recorded(skb)) {
2329  hash = skb_get_rx_queue(skb);
2330  while (unlikely(hash >= num_tx_queues))
2331  hash -= num_tx_queues;
2332  return hash;
2333  }
2334 
2335  if (dev->num_tc) {
2336  u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2337  qoffset = dev->tc_to_txq[tc].offset;
2338  qcount = dev->tc_to_txq[tc].count;
2339  }
2340 
2341  if (skb->sk && skb->sk->sk_hash)
2342  hash = skb->sk->sk_hash;
2343  else
2344  hash = (__force u16) skb->protocol;
2345  hash = jhash_1word(hash, hashrnd);
2346 
2347  return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2348 }
2350 
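
/* Illustrative note (not part of dev.c): the final line above maps a 32-bit
 * hash onto [qoffset, qoffset + qcount) without a modulo:
 *
 *	(u64)hash * qcount <= (2^32 - 1) * qcount,
 *	so ((u64)hash * qcount) >> 32 lies in [0, qcount).
 *
 * e.g. hash = 0x80000000 with qcount = 8 selects queue 4 (plus qoffset).
 */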
2351 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2352 {
2353  if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2354  net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2355  dev->name, queue_index,
2356  dev->real_num_tx_queues);
2357  return 0;
2358  }
2359  return queue_index;
2360 }
2361 
2362 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2363 {
2364 #ifdef CONFIG_XPS
2365  struct xps_dev_maps *dev_maps;
2366  struct xps_map *map;
2367  int queue_index = -1;
2368 
2369  rcu_read_lock();
2370  dev_maps = rcu_dereference(dev->xps_maps);
2371  if (dev_maps) {
2372  map = rcu_dereference(
2373  dev_maps->cpu_map[raw_smp_processor_id()]);
2374  if (map) {
2375  if (map->len == 1)
2376  queue_index = map->queues[0];
2377  else {
2378  u32 hash;
2379  if (skb->sk && skb->sk->sk_hash)
2380  hash = skb->sk->sk_hash;
2381  else
2382  hash = (__force u16) skb->protocol ^
2383  skb->rxhash;
2384  hash = jhash_1word(hash, hashrnd);
2385  queue_index = map->queues[
2386  ((u64)hash * map->len) >> 32];
2387  }
2388  if (unlikely(queue_index >= dev->real_num_tx_queues))
2389  queue_index = -1;
2390  }
2391  }
2392  rcu_read_unlock();
2393 
2394  return queue_index;
2395 #else
2396  return -1;
2397 #endif
2398 }
2399 
2400 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2401  struct sk_buff *skb)
2402 {
2403  int queue_index;
2404  const struct net_device_ops *ops = dev->netdev_ops;
2405 
2406  if (dev->real_num_tx_queues == 1)
2407  queue_index = 0;
2408  else if (ops->ndo_select_queue) {
2409  queue_index = ops->ndo_select_queue(dev, skb);
2410  queue_index = dev_cap_txqueue(dev, queue_index);
2411  } else {
2412  struct sock *sk = skb->sk;
2413  queue_index = sk_tx_queue_get(sk);
2414 
2415  if (queue_index < 0 || skb->ooo_okay ||
2416  queue_index >= dev->real_num_tx_queues) {
2417  int old_index = queue_index;
2418 
2419  queue_index = get_xps_queue(dev, skb);
2420  if (queue_index < 0)
2421  queue_index = skb_tx_hash(dev, skb);
2422 
2423  if (queue_index != old_index && sk) {
2424  struct dst_entry *dst =
2425  rcu_dereference_check(sk->sk_dst_cache, 1);
2426 
2427  if (dst && skb_dst(skb) == dst)
2428  sk_tx_queue_set(sk, queue_index);
2429  }
2430  }
2431  }
2432 
2433  skb_set_queue_mapping(skb, queue_index);
2434  return netdev_get_tx_queue(dev, queue_index);
2435 }
2436 
2437 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2438  struct net_device *dev,
2439  struct netdev_queue *txq)
2440 {
2441  spinlock_t *root_lock = qdisc_lock(q);
2442  bool contended;
2443  int rc;
2444 
2445  qdisc_skb_cb(skb)->pkt_len = skb->len;
2446  qdisc_calculate_pkt_len(skb, q);
2447  /*
2448  * Heuristic to force contended enqueues to serialize on a
2449  * separate lock before trying to get qdisc main lock.
2450  * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2451  * and dequeue packets faster.
2452  */
2453  contended = qdisc_is_running(q);
2454  if (unlikely(contended))
2455  spin_lock(&q->busylock);
2456 
2457  spin_lock(root_lock);
2458  if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2459  kfree_skb(skb);
2460  rc = NET_XMIT_DROP;
2461  } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2462  qdisc_run_begin(q)) {
2463  /*
2464  * This is a work-conserving queue; there are no old skbs
2465  * waiting to be sent out; and the qdisc is not running -
2466  * xmit the skb directly.
2467  */
2468  if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2469  skb_dst_force(skb);
2470 
2471  qdisc_bstats_update(q, skb);
2472 
2473  if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2474  if (unlikely(contended)) {
2475  spin_unlock(&q->busylock);
2476  contended = false;
2477  }
2478  __qdisc_run(q);
2479  } else
2480  qdisc_run_end(q);
2481 
2482  rc = NET_XMIT_SUCCESS;
2483  } else {
2484  skb_dst_force(skb);
2485  rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2486  if (qdisc_run_begin(q)) {
2487  if (unlikely(contended)) {
2488  spin_unlock(&q->busylock);
2489  contended = false;
2490  }
2491  __qdisc_run(q);
2492  }
2493  }
2494  spin_unlock(root_lock);
2495  if (unlikely(contended))
2496  spin_unlock(&q->busylock);
2497  return rc;
2498 }
2499 
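
/* Illustrative note (not part of dev.c): the busylock heuristic above means
 * that when several CPUs hit a busy qdisc at once, everyone except the
 * current __QDISC_STATE_RUNNING owner first serialises on q->busylock, so
 * the owner can re-acquire the root lock cheaply and drain packets in larger
 * batches instead of ping-ponging the root lock between enqueuers.
 */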
2500 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2501 static void skb_update_prio(struct sk_buff *skb)
2502 {
2503  struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2504 
2505  if (!skb->priority && skb->sk && map) {
2506  unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2507 
2508  if (prioidx < map->priomap_len)
2509  skb->priority = map->priomap[prioidx];
2510  }
2511 }
2512 #else
2513 #define skb_update_prio(skb)
2514 #endif
2515 
2516 static DEFINE_PER_CPU(int, xmit_recursion);
2517 #define RECURSION_LIMIT 10
2518 
2523 int dev_loopback_xmit(struct sk_buff *skb)
2524 {
2525  skb_reset_mac_header(skb);
2526  __skb_pull(skb, skb_network_offset(skb));
2527  skb->pkt_type = PACKET_LOOPBACK;
2528  skb->ip_summed = CHECKSUM_UNNECESSARY;
2529  WARN_ON(!skb_dst(skb));
2530  skb_dst_force(skb);
2531  netif_rx_ni(skb);
2532  return 0;
2533 }
2535 
2561 int dev_queue_xmit(struct sk_buff *skb)
2562 {
2563  struct net_device *dev = skb->dev;
2564  struct netdev_queue *txq;
2565  struct Qdisc *q;
2566  int rc = -ENOMEM;
2567 
2568  /* Disable soft irqs for various locks below. Also
2569  * stops preemption for RCU.
2570  */
2571  rcu_read_lock_bh();
2572 
2573  skb_update_prio(skb);
2574 
2575  txq = netdev_pick_tx(dev, skb);
2576  q = rcu_dereference_bh(txq->qdisc);
2577 
2578 #ifdef CONFIG_NET_CLS_ACT
2579  skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2580 #endif
2581  trace_net_dev_queue(skb);
2582  if (q->enqueue) {
2583  rc = __dev_xmit_skb(skb, q, dev, txq);
2584  goto out;
2585  }
2586 
2587  /* The device has no queue. Common case for software devices:
2588  loopback, all the sorts of tunnels...
2589 
2590  Really, it is unlikely that netif_tx_lock protection is necessary
2591  here. (f.e. loopback and IP tunnels are clean ignoring statistics
2592  counters.)
2593  However, it is possible, that they rely on protection
2594  made by us here.
2595 
2596  Check this and take the lock. It is not prone to deadlocks.
2597  Either way, the noqueue qdisc case is even simpler 8)
2598  */
2599  if (dev->flags & IFF_UP) {
2600  int cpu = smp_processor_id(); /* ok because BHs are off */
2601 
2602  if (txq->xmit_lock_owner != cpu) {
2603 
2604  if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2605  goto recursion_alert;
2606 
2607  HARD_TX_LOCK(dev, txq, cpu);
2608 
2609  if (!netif_xmit_stopped(txq)) {
2610  __this_cpu_inc(xmit_recursion);
2611  rc = dev_hard_start_xmit(skb, dev, txq);
2612  __this_cpu_dec(xmit_recursion);
2613  if (dev_xmit_complete(rc)) {
2614  HARD_TX_UNLOCK(dev, txq);
2615  goto out;
2616  }
2617  }
2618  HARD_TX_UNLOCK(dev, txq);
2619  net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2620  dev->name);
2621  } else {
2622  /* Recursion is detected! It is possible,
2623  * unfortunately
2624  */
2625 recursion_alert:
2626  net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2627  dev->name);
2628  }
2629  }
2630 
2631  rc = -ENETDOWN;
2632  rcu_read_unlock_bh();
2633 
2634  kfree_skb(skb);
2635  return rc;
2636 out:
2637  rcu_read_unlock_bh();
2638  return rc;
2639 }
2641 
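/* Editor's note: an illustrative sketch, not part of dev.c.  It shows the
 * usual way in-kernel code hands a fully built link-layer frame to
 * dev_queue_xmit(), which then runs through __dev_xmit_skb() or the
 * no-queue path above.  send_raw_frame() is hypothetical, the protocol value
 * is a placeholder, and error handling is minimal; the declarations come
 * from <linux/netdevice.h> and <linux/skbuff.h>, already included by this
 * file.
 */
static int send_raw_frame(struct net_device *dev, const void *frame, size_t len)
{
	struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);

	if (!skb)
		return -ENOMEM;

	memcpy(skb_put(skb, len), frame, len);	/* frame: complete L2 frame */
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);	/* placeholder protocol value */

	/* dev_queue_xmit() consumes the skb and selects a tx queue/qdisc. */
	return dev_queue_xmit(skb);
}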
2642 
2643 /*=======================================================================
2644  Receiver routines
2645  =======================================================================*/
2646 
2647 int netdev_max_backlog __read_mostly = 1000;
2648 EXPORT_SYMBOL(netdev_max_backlog);
2649 
2650 int netdev_tstamp_prequeue __read_mostly = 1;
2651 int netdev_budget __read_mostly = 300;
2652 int weight_p __read_mostly = 64; /* old backlog weight */
2653 
2654 /* Called with irq disabled */
2655 static inline void ____napi_schedule(struct softnet_data *sd,
2656  struct napi_struct *napi)
2657 {
2658  list_add_tail(&napi->poll_list, &sd->poll_list);
2659  __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2660 }
2661 
2662 /*
2663  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2664  * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
2665  * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
2666  * if hash is a canonical 4-tuple hash over transport ports.
2667  */
2668 void __skb_get_rxhash(struct sk_buff *skb)
2669 {
2670  struct flow_keys keys;
2671  u32 hash;
2672 
2673  if (!skb_flow_dissect(skb, &keys))
2674  return;
2675 
2676  if (keys.ports)
2677  skb->l4_rxhash = 1;
2678 
2679  /* get a consistent hash (same value on both flow directions) */
2680  if (((__force u32)keys.dst < (__force u32)keys.src) ||
2681  (((__force u32)keys.dst == (__force u32)keys.src) &&
2682  ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2683  swap(keys.dst, keys.src);
2684  swap(keys.port16[0], keys.port16[1]);
2685  }
2686 
2687  hash = jhash_3words((__force u32)keys.dst,
2688  (__force u32)keys.src,
2689  (__force u32)keys.ports, hashrnd);
2690  if (!hash)
2691  hash = 1;
2692 
2693  skb->rxhash = hash;
2694 }
2696 
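/* Editor's note: an illustrative sketch, not part of dev.c.  Callers normally
 * use the skb_get_rxhash() inline from <linux/skbuff.h>, which invokes
 * __skb_get_rxhash() only when skb->rxhash is still zero and then caches the
 * result.  classify_flow() is a hypothetical helper.
 */
static u32 classify_flow(struct sk_buff *skb)
{
	u32 hash = skb_get_rxhash(skb);	/* zero means "no valid hash" */

	/* l4_rxhash tells us the hash also covers the transport ports. */
	if (hash && skb->l4_rxhash)
		return hash;

	return 0;	/* fall back to some other classification */
}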
2697 #ifdef CONFIG_RPS
2698 
2699 /* One global table that all flow-based protocols share. */
2700 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2701 EXPORT_SYMBOL(rps_sock_flow_table);
2702 
2703 struct static_key rps_needed __read_mostly;
2704 
2705 static struct rps_dev_flow *
2706 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2707  struct rps_dev_flow *rflow, u16 next_cpu)
2708 {
2709  if (next_cpu != RPS_NO_CPU) {
2710 #ifdef CONFIG_RFS_ACCEL
2711  struct netdev_rx_queue *rxqueue;
2712  struct rps_dev_flow_table *flow_table;
2713  struct rps_dev_flow *old_rflow;
2714  u32 flow_id;
2715  u16 rxq_index;
2716  int rc;
2717 
2718  /* Should we steer this flow to a different hardware queue? */
2719  if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2720  !(dev->features & NETIF_F_NTUPLE))
2721  goto out;
2722  rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2723  if (rxq_index == skb_get_rx_queue(skb))
2724  goto out;
2725 
2726  rxqueue = dev->_rx + rxq_index;
2727  flow_table = rcu_dereference(rxqueue->rps_flow_table);
2728  if (!flow_table)
2729  goto out;
2730  flow_id = skb->rxhash & flow_table->mask;
2731  rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2732  rxq_index, flow_id);
2733  if (rc < 0)
2734  goto out;
2735  old_rflow = rflow;
2736  rflow = &flow_table->flows[flow_id];
2737  rflow->filter = rc;
2738  if (old_rflow->filter == rflow->filter)
2739  old_rflow->filter = RPS_NO_FILTER;
2740  out:
2741 #endif
2742  rflow->last_qtail =
2743  per_cpu(softnet_data, next_cpu).input_queue_head;
2744  }
2745 
2746  rflow->cpu = next_cpu;
2747  return rflow;
2748 }
2749 
2750 /*
2751  * get_rps_cpu is called from netif_receive_skb and returns the target
2752  * CPU from the RPS map of the receiving queue for a given skb.
2753  * rcu_read_lock must be held on entry.
2754  */
2755 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2756  struct rps_dev_flow **rflowp)
2757 {
2758  struct netdev_rx_queue *rxqueue;
2759  struct rps_map *map;
2760  struct rps_dev_flow_table *flow_table;
2761  struct rps_sock_flow_table *sock_flow_table;
2762  int cpu = -1;
2763  u16 tcpu;
2764 
2765  if (skb_rx_queue_recorded(skb)) {
2766  u16 index = skb_get_rx_queue(skb);
2767  if (unlikely(index >= dev->real_num_rx_queues)) {
2768  WARN_ONCE(dev->real_num_rx_queues > 1,
2769  "%s received packet on queue %u, but number "
2770  "of RX queues is %u\n",
2771  dev->name, index, dev->real_num_rx_queues);
2772  goto done;
2773  }
2774  rxqueue = dev->_rx + index;
2775  } else
2776  rxqueue = dev->_rx;
2777 
2778  map = rcu_dereference(rxqueue->rps_map);
2779  if (map) {
2780  if (map->len == 1 &&
2781  !rcu_access_pointer(rxqueue->rps_flow_table)) {
2782  tcpu = map->cpus[0];
2783  if (cpu_online(tcpu))
2784  cpu = tcpu;
2785  goto done;
2786  }
2787  } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2788  goto done;
2789  }
2790 
2791  skb_reset_network_header(skb);
2792  if (!skb_get_rxhash(skb))
2793  goto done;
2794 
2795  flow_table = rcu_dereference(rxqueue->rps_flow_table);
2796  sock_flow_table = rcu_dereference(rps_sock_flow_table);
2797  if (flow_table && sock_flow_table) {
2798  u16 next_cpu;
2799  struct rps_dev_flow *rflow;
2800 
2801  rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2802  tcpu = rflow->cpu;
2803 
2804  next_cpu = sock_flow_table->ents[skb->rxhash &
2805  sock_flow_table->mask];
2806 
2807  /*
2808  * If the desired CPU (where last recvmsg was done) is
2809  * different from current CPU (one in the rx-queue flow
2810  * table entry), switch if one of the following holds:
2811  * - Current CPU is unset (equal to RPS_NO_CPU).
2812  * - Current CPU is offline.
2813  * - The current CPU's queue tail has advanced beyond the
2814  * last packet that was enqueued using this table entry.
2815  * This guarantees that all previous packets for the flow
2816  * have been dequeued, thus preserving in order delivery.
2817  */
2818  if (unlikely(tcpu != next_cpu) &&
2819  (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2820  ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2821  rflow->last_qtail)) >= 0)) {
2822  tcpu = next_cpu;
2823  rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2824  }
2825 
2826  if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2827  *rflowp = rflow;
2828  cpu = tcpu;
2829  goto done;
2830  }
2831  }
2832 
2833  if (map) {
2834  tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2835 
2836  if (cpu_online(tcpu)) {
2837  cpu = tcpu;
2838  goto done;
2839  }
2840  }
2841 
2842 done:
2843  return cpu;
2844 }
2845 
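/* Editor's note: an illustrative sketch, not part of dev.c, of the index
 * computation get_rps_cpu() uses on the RPS map above: interpreting the
 * 32-bit hash as a fraction of 2^32, the multiply-and-shift is equivalent to
 * floor(len * hash / 2^32) and therefore maps hashes uniformly onto
 * [0, len) without a division.  rps_map_index() is a hypothetical name.
 */
static unsigned int rps_map_index(u32 hash, unsigned int len)
{
	return ((u64)hash * len) >> 32;
}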
2846 #ifdef CONFIG_RFS_ACCEL
2847 
2859 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2860  u32 flow_id, u16 filter_id)
2861 {
2862  struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2863  struct rps_dev_flow_table *flow_table;
2864  struct rps_dev_flow *rflow;
2865  bool expire = true;
2866  int cpu;
2867 
2868  rcu_read_lock();
2869  flow_table = rcu_dereference(rxqueue->rps_flow_table);
2870  if (flow_table && flow_id <= flow_table->mask) {
2871  rflow = &flow_table->flows[flow_id];
2872  cpu = ACCESS_ONCE(rflow->cpu);
2873  if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2874  ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2875  rflow->last_qtail) <
2876  (int)(10 * flow_table->mask)))
2877  expire = false;
2878  }
2879  rcu_read_unlock();
2880  return expire;
2881 }
2882 EXPORT_SYMBOL(rps_may_expire_flow);
2883 
2884 #endif /* CONFIG_RFS_ACCEL */
2885 
2886 /* Called from hardirq (IPI) context */
2887 static void rps_trigger_softirq(void *data)
2888 {
2889  struct softnet_data *sd = data;
2890 
2891  ____napi_schedule(sd, &sd->backlog);
2892  sd->received_rps++;
2893 }
2894 
2895 #endif /* CONFIG_RPS */
2896 
2897 /*
2898  * Check whether this softnet_data structure belongs to another cpu.
2899  * If yes, queue it on our IPI list and return 1.
2900  * If no, return 0.
2901  */
2902 static int rps_ipi_queued(struct softnet_data *sd)
2903 {
2904 #ifdef CONFIG_RPS
2905  struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2906 
2907  if (sd != mysd) {
2908  sd->rps_ipi_next = mysd->rps_ipi_list;
2909  mysd->rps_ipi_list = sd;
2910 
2911  __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2912  return 1;
2913  }
2914 #endif /* CONFIG_RPS */
2915  return 0;
2916 }
2917 
2918 /*
2919  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2920  * queue (may be a remote CPU queue).
2921  */
2922 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2923  unsigned int *qtail)
2924 {
2925  struct softnet_data *sd;
2926  unsigned long flags;
2927 
2928  sd = &per_cpu(softnet_data, cpu);
2929 
2930  local_irq_save(flags);
2931 
2932  rps_lock(sd);
2933  if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2934  if (skb_queue_len(&sd->input_pkt_queue)) {
2935 enqueue:
2936  __skb_queue_tail(&sd->input_pkt_queue, skb);
2937  input_queue_tail_incr_save(sd, qtail);
2938  rps_unlock(sd);
2939  local_irq_restore(flags);
2940  return NET_RX_SUCCESS;
2941  }
2942 
2943  /* Schedule NAPI for backlog device
2944  * We can use non atomic operation since we own the queue lock
2945  */
2946  if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2947  if (!rps_ipi_queued(sd))
2948  ____napi_schedule(sd, &sd->backlog);
2949  }
2950  goto enqueue;
2951  }
2952 
2953  sd->dropped++;
2954  rps_unlock(sd);
2955 
2956  local_irq_restore(flags);
2957 
2958  atomic_long_inc(&skb->dev->rx_dropped);
2959  kfree_skb(skb);
2960  return NET_RX_DROP;
2961 }
2962 
2978 int netif_rx(struct sk_buff *skb)
2979 {
2980  int ret;
2981 
2982  /* if netpoll wants it, pretend we never saw it */
2983  if (netpoll_rx(skb))
2984  return NET_RX_DROP;
2985 
2986  net_timestamp_check(netdev_tstamp_prequeue, skb);
2987 
2988  trace_netif_rx(skb);
2989 #ifdef CONFIG_RPS
2990  if (static_key_false(&rps_needed)) {
2991  struct rps_dev_flow voidflow, *rflow = &voidflow;
2992  int cpu;
2993 
2994  preempt_disable();
2995  rcu_read_lock();
2996 
2997  cpu = get_rps_cpu(skb->dev, skb, &rflow);
2998  if (cpu < 0)
2999  cpu = smp_processor_id();
3000 
3001  ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3002 
3003  rcu_read_unlock();
3004  preempt_enable();
3005  } else
3006 #endif
3007  {
3008  unsigned int qtail;
3009  ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3010  put_cpu();
3011  }
3012  return ret;
3013 }
3015 
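/* Editor's note: an illustrative sketch, not part of dev.c, of the classic
 * non-NAPI receive path: an interrupt handler copies the frame into an skb
 * and hands it to netif_rx(), which queues it on a per-CPU backlog via
 * enqueue_to_backlog() above.  my_rx_frame() is hypothetical, error handling
 * is minimal, and eth_type_trans() is assumed from <linux/etherdevice.h>.
 */
static void my_rx_frame(struct net_device *dev, const void *frame, int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), frame, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	netif_rx(skb);	/* returns NET_RX_SUCCESS or NET_RX_DROP */
}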
3016 int netif_rx_ni(struct sk_buff *skb)
3017 {
3018  int err;
3019 
3020  preempt_disable();
3021  err = netif_rx(skb);
3022  if (local_softirq_pending())
3023  do_softirq();
3024  preempt_enable();
3025 
3026  return err;
3027 }
3029 
3030 static void net_tx_action(struct softirq_action *h)
3031 {
3032  struct softnet_data *sd = &__get_cpu_var(softnet_data);
3033 
3034  if (sd->completion_queue) {
3035  struct sk_buff *clist;
3036 
3037  local_irq_disable();
3038  clist = sd->completion_queue;
3039  sd->completion_queue = NULL;
3040  local_irq_enable();
3041 
3042  while (clist) {
3043  struct sk_buff *skb = clist;
3044  clist = clist->next;
3045 
3046  WARN_ON(atomic_read(&skb->users));
3047  trace_kfree_skb(skb, net_tx_action);
3048  __kfree_skb(skb);
3049  }
3050  }
3051 
3052  if (sd->output_queue) {
3053  struct Qdisc *head;
3054 
3055  local_irq_disable();
3056  head = sd->output_queue;
3057  sd->output_queue = NULL;
3058  sd->output_queue_tailp = &sd->output_queue;
3059  local_irq_enable();
3060 
3061  while (head) {
3062  struct Qdisc *q = head;
3063  spinlock_t *root_lock;
3064 
3065  head = head->next_sched;
3066 
3067  root_lock = qdisc_lock(q);
3068  if (spin_trylock(root_lock)) {
3069  smp_mb__before_clear_bit();
3070  clear_bit(__QDISC_STATE_SCHED,
3071  &q->state);
3072  qdisc_run(q);
3073  spin_unlock(root_lock);
3074  } else {
3075  if (!test_bit(__QDISC_STATE_DEACTIVATED,
3076  &q->state)) {
3077  __netif_reschedule(q);
3078  } else {
3079  smp_mb__before_clear_bit();
3080  clear_bit(__QDISC_STATE_SCHED,
3081  &q->state);
3082  }
3083  }
3084  }
3085  }
3086 }
3087 
3088 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3089  (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3090 /* This hook is defined here for ATM LANE */
3091 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3092  unsigned char *addr) __read_mostly;
3093 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3094 #endif
3095 
3096 #ifdef CONFIG_NET_CLS_ACT
3097 /* TODO: Maybe we should just force sch_ingress to be compiled in
3098  * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
3099  * instructions (a compare and two extra stores) when CONFIG_NET_CLS_ACT
3100  * is enabled but the ingress scheduler is not.
3101  * NOTE: This doesn't stop any functionality; if you don't have
3102  * the ingress scheduler, you just can't add policies on ingress.
3103  *
3104  */
3105 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3106 {
3107  struct net_device *dev = skb->dev;
3108  u32 ttl = G_TC_RTTL(skb->tc_verd);
3109  int result = TC_ACT_OK;
3110  struct Qdisc *q;
3111 
3112  if (unlikely(MAX_RED_LOOP < ttl++)) {
3113  net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3114  skb->skb_iif, dev->ifindex);
3115  return TC_ACT_SHOT;
3116  }
3117 
3118  skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3119  skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3120 
3121  q = rxq->qdisc;
3122  if (q != &noop_qdisc) {
3123  spin_lock(qdisc_lock(q));
3124  if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3125  result = qdisc_enqueue_root(skb, q);
3126  spin_unlock(qdisc_lock(q));
3127  }
3128 
3129  return result;
3130 }
3131 
3132 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3133  struct packet_type **pt_prev,
3134  int *ret, struct net_device *orig_dev)
3135 {
3136  struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3137 
3138  if (!rxq || rxq->qdisc == &noop_qdisc)
3139  goto out;
3140 
3141  if (*pt_prev) {
3142  *ret = deliver_skb(skb, *pt_prev, orig_dev);
3143  *pt_prev = NULL;
3144  }
3145 
3146  switch (ing_filter(skb, rxq)) {
3147  case TC_ACT_SHOT:
3148  case TC_ACT_STOLEN:
3149  kfree_skb(skb);
3150  return NULL;
3151  }
3152 
3153 out:
3154  skb->tc_verd = 0;
3155  return skb;
3156 }
3157 #endif
3158 
3173 int netdev_rx_handler_register(struct net_device *dev,
3174  rx_handler_func_t *rx_handler,
3175  void *rx_handler_data)
3176 {
3177  ASSERT_RTNL();
3178 
3179  if (dev->rx_handler)
3180  return -EBUSY;
3181 
3182  rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3183  rcu_assign_pointer(dev->rx_handler, rx_handler);
3184 
3185  return 0;
3186 }
3188 
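/* Editor's note: an illustrative sketch, not part of dev.c, showing how a
 * bridge- or bonding-style module attaches an rx_handler to a lower device.
 * my_handler(), my_attach() and the my_port cookie are hypothetical; the
 * registered data pointer comes back to the handler via
 * rcu_dereference(skb->dev->rx_handler_data).
 */
static rx_handler_result_t my_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* Inspect, steal or redirect skb here. */
	(void)skb;
	return RX_HANDLER_PASS;	/* let the normal stack continue */
}

static int my_attach(struct net_device *lower, void *my_port)
{
	int err;

	rtnl_lock();	/* netdev_rx_handler_register() asserts the rtnl */
	err = netdev_rx_handler_register(lower, my_handler, my_port);
	rtnl_unlock();

	return err;	/* -EBUSY if another handler is already installed */
}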
3197 void netdev_rx_handler_unregister(struct net_device *dev)
3198 {
3199 
3200  ASSERT_RTNL();
3201  RCU_INIT_POINTER(dev->rx_handler, NULL);
3202  RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3203 }
3205 
3206 /*
3207  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3208  * the special handling of PFMEMALLOC skbs.
3209  */
3210 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3211 {
3212  switch (skb->protocol) {
3213  case __constant_htons(ETH_P_ARP):
3214  case __constant_htons(ETH_P_IP):
3215  case __constant_htons(ETH_P_IPV6):
3216  case __constant_htons(ETH_P_8021Q):
3217  return true;
3218  default:
3219  return false;
3220  }
3221 }
3222 
3223 static int __netif_receive_skb(struct sk_buff *skb)
3224 {
3225  struct packet_type *ptype, *pt_prev;
3226  rx_handler_func_t *rx_handler;
3227  struct net_device *orig_dev;
3228  struct net_device *null_or_dev;
3229  bool deliver_exact = false;
3230  int ret = NET_RX_DROP;
3231  __be16 type;
3232  unsigned long pflags = current->flags;
3233 
3234  net_timestamp_check(!netdev_tstamp_prequeue, skb);
3235 
3236  trace_netif_receive_skb(skb);
3237 
3238  /*
3239  * PFMEMALLOC skbs are special, they should
3240  * - be delivered to SOCK_MEMALLOC sockets only
3241  * - stay away from userspace
3242  * - have bounded memory usage
3243  *
3244  * Use PF_MEMALLOC as this saves us from propagating the allocation
3245  * context down to all allocation sites.
3246  */
3247  if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3248  current->flags |= PF_MEMALLOC;
3249 
3250  /* if we've gotten here through NAPI, check netpoll */
3251  if (netpoll_receive_skb(skb))
3252  goto out;
3253 
3254  orig_dev = skb->dev;
3255 
3256  skb_reset_network_header(skb);
3257  skb_reset_transport_header(skb);
3258  skb_reset_mac_len(skb);
3259 
3260  pt_prev = NULL;
3261 
3262  rcu_read_lock();
3263 
3264 another_round:
3265  skb->skb_iif = skb->dev->ifindex;
3266 
3267  __this_cpu_inc(softnet_data.processed);
3268 
3269  if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3270  skb = vlan_untag(skb);
3271  if (unlikely(!skb))
3272  goto unlock;
3273  }
3274 
3275 #ifdef CONFIG_NET_CLS_ACT
3276  if (skb->tc_verd & TC_NCLS) {
3277  skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3278  goto ncls;
3279  }
3280 #endif
3281 
3282  if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3283  goto skip_taps;
3284 
3285  list_for_each_entry_rcu(ptype, &ptype_all, list) {
3286  if (!ptype->dev || ptype->dev == skb->dev) {
3287  if (pt_prev)
3288  ret = deliver_skb(skb, pt_prev, orig_dev);
3289  pt_prev = ptype;
3290  }
3291  }
3292 
3293 skip_taps:
3294 #ifdef CONFIG_NET_CLS_ACT
3295  skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3296  if (!skb)
3297  goto unlock;
3298 ncls:
3299 #endif
3300 
3301  if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3302  && !skb_pfmemalloc_protocol(skb))
3303  goto drop;
3304 
3305  if (vlan_tx_tag_present(skb)) {
3306  if (pt_prev) {
3307  ret = deliver_skb(skb, pt_prev, orig_dev);
3308  pt_prev = NULL;
3309  }
3310  if (vlan_do_receive(&skb))
3311  goto another_round;
3312  else if (unlikely(!skb))
3313  goto unlock;
3314  }
3315 
3316  rx_handler = rcu_dereference(skb->dev->rx_handler);
3317  if (rx_handler) {
3318  if (pt_prev) {
3319  ret = deliver_skb(skb, pt_prev, orig_dev);
3320  pt_prev = NULL;
3321  }
3322  switch (rx_handler(&skb)) {
3323  case RX_HANDLER_CONSUMED:
3324  goto unlock;
3325  case RX_HANDLER_ANOTHER:
3326  goto another_round;
3327  case RX_HANDLER_EXACT:
3328  deliver_exact = true;
3329  case RX_HANDLER_PASS:
3330  break;
3331  default:
3332  BUG();
3333  }
3334  }
3335 
3336  if (vlan_tx_nonzero_tag_present(skb))
3337  skb->pkt_type = PACKET_OTHERHOST;
3338 
3339  /* deliver only exact match when indicated */
3340  null_or_dev = deliver_exact ? skb->dev : NULL;
3341 
3342  type = skb->protocol;
3343  list_for_each_entry_rcu(ptype,
3344  &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3345  if (ptype->type == type &&
3346  (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3347  ptype->dev == orig_dev)) {
3348  if (pt_prev)
3349  ret = deliver_skb(skb, pt_prev, orig_dev);
3350  pt_prev = ptype;
3351  }
3352  }
3353 
3354  if (pt_prev) {
3355  if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3356  goto drop;
3357  else
3358  ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3359  } else {
3360 drop:
3361  atomic_long_inc(&skb->dev->rx_dropped);
3362  kfree_skb(skb);
3363  /* Jamal, now you will not be able to escape explaining
3364  * to me how you were going to use this. :-)
3365  */
3366  ret = NET_RX_DROP;
3367  }
3368 
3369 unlock:
3370  rcu_read_unlock();
3371 out:
3372  tsk_restore_flags(current, pflags, PF_MEMALLOC);
3373  return ret;
3374 }
3375 
3391 int netif_receive_skb(struct sk_buff *skb)
3392 {
3393  net_timestamp_check(netdev_tstamp_prequeue, skb);
3394 
3395  if (skb_defer_rx_timestamp(skb))
3396  return NET_RX_SUCCESS;
3397 
3398 #ifdef CONFIG_RPS
3399  if (static_key_false(&rps_needed)) {
3400  struct rps_dev_flow voidflow, *rflow = &voidflow;
3401  int cpu, ret;
3402 
3403  rcu_read_lock();
3404 
3405  cpu = get_rps_cpu(skb->dev, skb, &rflow);
3406 
3407  if (cpu >= 0) {
3408  ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3409  rcu_read_unlock();
3410  return ret;
3411  }
3412  rcu_read_unlock();
3413  }
3414 #endif
3415  return __netif_receive_skb(skb);
3416 }
3418 
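/* Editor's note: an illustrative sketch, not part of dev.c.  Unlike
 * netif_rx(), netif_receive_skb() processes the packet synchronously and is
 * meant to be called from softirq (NAPI poll) context.  my_deliver() is a
 * hypothetical helper; eth_type_trans() is assumed from
 * <linux/etherdevice.h>.
 */
static void my_deliver(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	netif_receive_skb(skb);	/* delivered in-line; RPS may still redirect it */
}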
3419 /* Network device is going away, flush any packets still pending
3420  * Called with irqs disabled.
3421  */
3422 static void flush_backlog(void *arg)
3423 {
3424  struct net_device *dev = arg;
3425  struct softnet_data *sd = &__get_cpu_var(softnet_data);
3426  struct sk_buff *skb, *tmp;
3427 
3428  rps_lock(sd);
3429  skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3430  if (skb->dev == dev) {
3431  __skb_unlink(skb, &sd->input_pkt_queue);
3432  kfree_skb(skb);
3433  input_queue_head_incr(sd);
3434  }
3435  }
3436  rps_unlock(sd);
3437 
3438  skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3439  if (skb->dev == dev) {
3440  __skb_unlink(skb, &sd->process_queue);
3441  kfree_skb(skb);
3442  input_queue_head_incr(sd);
3443  }
3444  }
3445 }
3446 
3447 static int napi_gro_complete(struct sk_buff *skb)
3448 {
3449  struct packet_type *ptype;
3450  __be16 type = skb->protocol;
3451  struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3452  int err = -ENOENT;
3453 
3454  BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3455 
3456  if (NAPI_GRO_CB(skb)->count == 1) {
3457  skb_shinfo(skb)->gso_size = 0;
3458  goto out;
3459  }
3460 
3461  rcu_read_lock();
3462  list_for_each_entry_rcu(ptype, head, list) {
3463  if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3464  continue;
3465 
3466  err = ptype->gro_complete(skb);
3467  break;
3468  }
3469  rcu_read_unlock();
3470 
3471  if (err) {
3472  WARN_ON(&ptype->list == head);
3473  kfree_skb(skb);
3474  return NET_RX_SUCCESS;
3475  }
3476 
3477 out:
3478  return netif_receive_skb(skb);
3479 }
3480 
3481 /* napi->gro_list contains packets ordered by age, with the
3482  * youngest packets at the head of it.
3483  * Complete skbs in reverse order to reduce latencies.
3484  */
3485 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3486 {
3487  struct sk_buff *skb, *prev = NULL;
3488 
3489  /* scan list and build reverse chain */
3490  for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3491  skb->prev = prev;
3492  prev = skb;
3493  }
3494 
3495  for (skb = prev; skb; skb = prev) {
3496  skb->next = NULL;
3497 
3498  if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3499  return;
3500 
3501  prev = skb->prev;
3502  napi_gro_complete(skb);
3503  napi->gro_count--;
3504  }
3505 
3506  napi->gro_list = NULL;
3507 }
3509 
3510 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3511 {
3512  struct sk_buff **pp = NULL;
3513  struct packet_type *ptype;
3514  __be16 type = skb->protocol;
3515  struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3516  int same_flow;
3517  int mac_len;
3518  enum gro_result ret;
3519 
3520  if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3521  goto normal;
3522 
3523  if (skb_is_gso(skb) || skb_has_frag_list(skb))
3524  goto normal;
3525 
3526  rcu_read_lock();
3527  list_for_each_entry_rcu(ptype, head, list) {
3528  if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3529  continue;
3530 
3531  skb_set_network_header(skb, skb_gro_offset(skb));
3532  mac_len = skb->network_header - skb->mac_header;
3533  skb->mac_len = mac_len;
3534  NAPI_GRO_CB(skb)->same_flow = 0;
3535  NAPI_GRO_CB(skb)->flush = 0;
3536  NAPI_GRO_CB(skb)->free = 0;
3537 
3538  pp = ptype->gro_receive(&napi->gro_list, skb);
3539  break;
3540  }
3541  rcu_read_unlock();
3542 
3543  if (&ptype->list == head)
3544  goto normal;
3545 
3546  same_flow = NAPI_GRO_CB(skb)->same_flow;
3547  ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3548 
3549  if (pp) {
3550  struct sk_buff *nskb = *pp;
3551 
3552  *pp = nskb->next;
3553  nskb->next = NULL;
3554  napi_gro_complete(nskb);
3555  napi->gro_count--;
3556  }
3557 
3558  if (same_flow)
3559  goto ok;
3560 
3561  if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3562  goto normal;
3563 
3564  napi->gro_count++;
3565  NAPI_GRO_CB(skb)->count = 1;
3566  NAPI_GRO_CB(skb)->age = jiffies;
3567  skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3568  skb->next = napi->gro_list;
3569  napi->gro_list = skb;
3570  ret = GRO_HELD;
3571 
3572 pull:
3573  if (skb_headlen(skb) < skb_gro_offset(skb)) {
3574  int grow = skb_gro_offset(skb) - skb_headlen(skb);
3575 
3576  BUG_ON(skb->end - skb->tail < grow);
3577 
3578  memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3579 
3580  skb->tail += grow;
3581  skb->data_len -= grow;
3582 
3583  skb_shinfo(skb)->frags[0].page_offset += grow;
3584  skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3585 
3586  if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3587  skb_frag_unref(skb, 0);
3588  memmove(skb_shinfo(skb)->frags,
3589  skb_shinfo(skb)->frags + 1,
3590  --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3591  }
3592  }
3593 
3594 ok:
3595  return ret;
3596 
3597 normal:
3598  ret = GRO_NORMAL;
3599  goto pull;
3600 }
3602 
3603 static inline gro_result_t
3604 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3605 {
3606  struct sk_buff *p;
3607  unsigned int maclen = skb->dev->hard_header_len;
3608 
3609  for (p = napi->gro_list; p; p = p->next) {
3610  unsigned long diffs;
3611 
3612  diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3613  diffs |= p->vlan_tci ^ skb->vlan_tci;
3614  if (maclen == ETH_HLEN)
3615  diffs |= compare_ether_header(skb_mac_header(p),
3616  skb_gro_mac_header(skb));
3617  else if (!diffs)
3618  diffs = memcmp(skb_mac_header(p),
3619  skb_gro_mac_header(skb),
3620  maclen);
3621  NAPI_GRO_CB(p)->same_flow = !diffs;
3622  NAPI_GRO_CB(p)->flush = 0;
3623  }
3624 
3625  return dev_gro_receive(napi, skb);
3626 }
3627 
3628 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3629 {
3630  switch (ret) {
3631  case GRO_NORMAL:
3632  if (netif_receive_skb(skb))
3633  ret = GRO_DROP;
3634  break;
3635 
3636  case GRO_DROP:
3637  kfree_skb(skb);
3638  break;
3639 
3640  case GRO_MERGED_FREE:
3641  if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3642  kmem_cache_free(skbuff_head_cache, skb);
3643  else
3644  __kfree_skb(skb);
3645  break;
3646 
3647  case GRO_HELD:
3648  case GRO_MERGED:
3649  break;
3650  }
3651 
3652  return ret;
3653 }
3655 
3656 static void skb_gro_reset_offset(struct sk_buff *skb)
3657 {
3658  const struct skb_shared_info *pinfo = skb_shinfo(skb);
3659  const skb_frag_t *frag0 = &pinfo->frags[0];
3660 
3661  NAPI_GRO_CB(skb)->data_offset = 0;
3662  NAPI_GRO_CB(skb)->frag0 = NULL;
3663  NAPI_GRO_CB(skb)->frag0_len = 0;
3664 
3665  if (skb->mac_header == skb->tail &&
3666  pinfo->nr_frags &&
3667  !PageHighMem(skb_frag_page(frag0))) {
3668  NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3669  NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3670  }
3671 }
3672 
3673 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3674 {
3675  skb_gro_reset_offset(skb);
3676 
3677  return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3678 }
3680 
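/* Editor's note: an illustrative sketch, not part of dev.c.  A GRO-capable
 * driver passes received skbs to napi_gro_receive() from its poll routine so
 * that consecutive segments of one flow can be merged before travelling up
 * the stack.  my_gro_deliver() is hypothetical.
 */
static void my_gro_deliver(struct napi_struct *napi, struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;

	skb->protocol = eth_type_trans(skb, dev);

	/* The GRO layer owns the skb from here on, whatever the result. */
	if (napi_gro_receive(napi, skb) == GRO_DROP)
		dev->stats.rx_dropped++;
}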
3681 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3682 {
3683  __skb_pull(skb, skb_headlen(skb));
3684  /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3685  skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3686  skb->vlan_tci = 0;
3687  skb->dev = napi->dev;
3688  skb->skb_iif = 0;
3689 
3690  napi->skb = skb;
3691 }
3692 
3693 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3694 {
3695  struct sk_buff *skb = napi->skb;
3696 
3697  if (!skb) {
3698  skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3699  if (skb)
3700  napi->skb = skb;
3701  }
3702  return skb;
3703 }
3705 
3706 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3707  gro_result_t ret)
3708 {
3709  switch (ret) {
3710  case GRO_NORMAL:
3711  case GRO_HELD:
3712  skb->protocol = eth_type_trans(skb, skb->dev);
3713 
3714  if (ret == GRO_HELD)
3715  skb_gro_pull(skb, -ETH_HLEN);
3716  else if (netif_receive_skb(skb))
3717  ret = GRO_DROP;
3718  break;
3719 
3720  case GRO_DROP:
3721  case GRO_MERGED_FREE:
3722  napi_reuse_skb(napi, skb);
3723  break;
3724 
3725  case GRO_MERGED:
3726  break;
3727  }
3728 
3729  return ret;
3730 }
3732 
3733 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3734 {
3735  struct sk_buff *skb = napi->skb;
3736  struct ethhdr *eth;
3737  unsigned int hlen;
3738  unsigned int off;
3739 
3740  napi->skb = NULL;
3741 
3742  skb_reset_mac_header(skb);
3743  skb_gro_reset_offset(skb);
3744 
3745  off = skb_gro_offset(skb);
3746  hlen = off + sizeof(*eth);
3747  eth = skb_gro_header_fast(skb, off);
3748  if (skb_gro_header_hard(skb, hlen)) {
3749  eth = skb_gro_header_slow(skb, hlen, off);
3750  if (unlikely(!eth)) {
3751  napi_reuse_skb(napi, skb);
3752  skb = NULL;
3753  goto out;
3754  }
3755  }
3756 
3757  skb_gro_pull(skb, sizeof(*eth));
3758 
3759  /*
3760  * This works because the only protocols we care about don't require
3761  * special handling. We'll fix it up properly at the end.
3762  */
3763  skb->protocol = eth->h_proto;
3764 
3765 out:
3766  return skb;
3767 }
3768 
3769 gro_result_t napi_gro_frags(struct napi_struct *napi)
3770 {
3771  struct sk_buff *skb = napi_frags_skb(napi);
3772 
3773  if (!skb)
3774  return GRO_DROP;
3775 
3776  return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3777 }
3779 
3780 /*
3781  * net_rps_action sends any pending IPI's for rps.
3782  * Note: called with local irq disabled, but exits with local irq enabled.
3783  */
3784 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3785 {
3786 #ifdef CONFIG_RPS
3787  struct softnet_data *remsd = sd->rps_ipi_list;
3788 
3789  if (remsd) {
3790  sd->rps_ipi_list = NULL;
3791 
3792  local_irq_enable();
3793 
3794  /* Send pending IPI's to kick RPS processing on remote cpus. */
3795  while (remsd) {
3796  struct softnet_data *next = remsd->rps_ipi_next;
3797 
3798  if (cpu_online(remsd->cpu))
3799  __smp_call_function_single(remsd->cpu,
3800  &remsd->csd, 0);
3801  remsd = next;
3802  }
3803  } else
3804 #endif
3805  local_irq_enable();
3806 }
3807 
3808 static int process_backlog(struct napi_struct *napi, int quota)
3809 {
3810  int work = 0;
3811  struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3812 
3813 #ifdef CONFIG_RPS
3814  /* Check if we have pending IPIs; it is better to send them now
3815  * rather than waiting for net_rx_action() to end.
3816  */
3817  if (sd->rps_ipi_list) {
3818  local_irq_disable();
3819  net_rps_action_and_irq_enable(sd);
3820  }
3821 #endif
3822  napi->weight = weight_p;
3823  local_irq_disable();
3824  while (work < quota) {
3825  struct sk_buff *skb;
3826  unsigned int qlen;
3827 
3828  while ((skb = __skb_dequeue(&sd->process_queue))) {
3829  local_irq_enable();
3830  __netif_receive_skb(skb);
3831  local_irq_disable();
3832  input_queue_head_incr(sd);
3833  if (++work >= quota) {
3834  local_irq_enable();
3835  return work;
3836  }
3837  }
3838 
3839  rps_lock(sd);
3840  qlen = skb_queue_len(&sd->input_pkt_queue);
3841  if (qlen)
3842  skb_queue_splice_tail_init(&sd->input_pkt_queue,
3843  &sd->process_queue);
3844 
3845  if (qlen < quota - work) {
3846  /*
3847  * Inline a custom version of __napi_complete().
3848  * Only the current cpu owns and manipulates this napi,
3849  * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3850  * so we can use a plain write instead of clear_bit()
3851  * and we don't need an smp_mb() memory barrier.
3852  */
3853  list_del(&napi->poll_list);
3854  napi->state = 0;
3855 
3856  quota = work + qlen;
3857  }
3858  rps_unlock(sd);
3859  }
3860  local_irq_enable();
3861 
3862  return work;
3863 }
3864 
3871 void __napi_schedule(struct napi_struct *n)
3872 {
3873  unsigned long flags;
3874 
3875  local_irq_save(flags);
3876  ____napi_schedule(&__get_cpu_var(softnet_data), n);
3877  local_irq_restore(flags);
3878 }
3880 
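/* Editor's note: an illustrative sketch, not part of dev.c, of the usual
 * interrupt-side pattern: mask further RX interrupts and defer the work to
 * NAPI.  Most drivers use napi_schedule(), which combines the
 * napi_schedule_prep() test with the __napi_schedule() call above.
 * struct my_adapter, my_irq_handler() and my_mask_rx_irqs() are
 * hypothetical; irqreturn_t comes from <linux/interrupt.h>.
 */
struct my_adapter {
	struct napi_struct napi;	/* embedded in the driver private data */
};

static void my_mask_rx_irqs(struct my_adapter *ap)
{
	/* device-specific: stop the NIC from raising further RX interrupts */
}

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	struct my_adapter *ap = dev_id;

	if (napi_schedule_prep(&ap->napi)) {
		my_mask_rx_irqs(ap);
		__napi_schedule(&ap->napi);	/* put it on this CPU's poll list */
	}

	return IRQ_HANDLED;
}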
3881 void __napi_complete(struct napi_struct *n)
3882 {
3883  BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3884  BUG_ON(n->gro_list);
3885 
3886  list_del(&n->poll_list);
3887  smp_mb__before_clear_bit();
3888  clear_bit(NAPI_STATE_SCHED, &n->state);
3889 }
3891 
3892 void napi_complete(struct napi_struct *n)
3893 {
3894  unsigned long flags;
3895 
3896  /*
3897  * Don't let napi dequeue from the cpu poll list
3898  * just in case it's running on a different cpu.
3899  */
3900  if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3901  return;
3902 
3903  napi_gro_flush(n, false);
3904  local_irq_save(flags);
3905  __napi_complete(n);
3906  local_irq_restore(flags);
3907 }
3909 
3910 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3911  int (*poll)(struct napi_struct *, int), int weight)
3912 {
3913  INIT_LIST_HEAD(&napi->poll_list);
3914  napi->gro_count = 0;
3915  napi->gro_list = NULL;
3916  napi->skb = NULL;
3917  napi->poll = poll;
3918  napi->weight = weight;
3919  list_add(&napi->dev_list, &dev->napi_list);
3920  napi->dev = dev;
3921 #ifdef CONFIG_NETPOLL
3922  spin_lock_init(&napi->poll_lock);
3923  napi->poll_owner = -1;
3924 #endif
3925  set_bit(NAPI_STATE_SCHED, &napi->state);
3926 }
3928 
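/* Editor's note: an illustrative sketch, not part of dev.c, of how a driver
 * wires up NAPI: register a poll callback with netif_napi_add() at probe
 * time, then complete NAPI and re-enable interrupts once a poll round does
 * less work than its budget.  my_poll(), my_clean_rx() and
 * my_unmask_rx_irqs() are hypothetical.
 */
static int my_clean_rx(struct napi_struct *napi, int budget);	/* hypothetical */
static void my_unmask_rx_irqs(struct napi_struct *napi);	/* hypothetical */

static int my_poll(struct napi_struct *napi, int budget)
{
	int done = my_clean_rx(napi, budget);

	if (done < budget) {
		napi_complete(napi);		/* clears NAPI_STATE_SCHED */
		my_unmask_rx_irqs(napi);
	}

	return done;
}

static void my_setup_napi(struct net_device *dev, struct napi_struct *napi)
{
	/* 64 is the conventional weight for Ethernet drivers. */
	netif_napi_add(dev, napi, my_poll, 64);
	napi_enable(napi);
}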
3929 void netif_napi_del(struct napi_struct *napi)
3930 {
3931  struct sk_buff *skb, *next;
3932 
3933  list_del_init(&napi->dev_list);
3934  napi_free_frags(napi);
3935 
3936  for (skb = napi->gro_list; skb; skb = next) {
3937  next = skb->next;
3938  skb->next = NULL;
3939  kfree_skb(skb);
3940  }
3941 
3942  napi->gro_list = NULL;
3943  napi->gro_count = 0;
3944 }
3946 
3947 static void net_rx_action(struct softirq_action *h)
3948 {
3949  struct softnet_data *sd = &__get_cpu_var(softnet_data);
3950  unsigned long time_limit = jiffies + 2;
3951  int budget = netdev_budget;
3952  void *have;
3953 
3954  local_irq_disable();
3955 
3956  while (!list_empty(&sd->poll_list)) {
3957  struct napi_struct *n;
3958  int work, weight;
3959 
3960  /* If the softirq window is exhausted then punt.
3961  * Allow this to run for 2 jiffies, which allows
3962  * an average latency of 1.5/HZ.
3963  */
3964  if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3965  goto softnet_break;
3966 
3967  local_irq_enable();
3968 
3969  /* Even though interrupts have been re-enabled, this
3970  * access is safe because interrupts can only add new
3971  * entries to the tail of this list, and only ->poll()
3972  * calls can remove this head entry from the list.
3973  */
3974  n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3975 
3976  have = netpoll_poll_lock(n);
3977 
3978  weight = n->weight;
3979 
3980  /* This NAPI_STATE_SCHED test is for avoiding a race
3981  * with netpoll's poll_napi(). Only the entity which
3982  * obtains the lock and sees NAPI_STATE_SCHED set will
3983  * actually make the ->poll() call. Therefore we avoid
3984  * accidentally calling ->poll() when NAPI is not scheduled.
3985  */
3986  work = 0;
3987  if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3988  work = n->poll(n, weight);
3989  trace_napi_poll(n);
3990  }
3991 
3992  WARN_ON_ONCE(work > weight);
3993 
3994  budget -= work;
3995 
3996  local_irq_disable();
3997 
3998  /* Drivers must not modify the NAPI state if they
3999  * consume the entire weight. In such cases this code
4000  * still "owns" the NAPI instance and therefore can
4001  * move the instance around on the list at-will.
4002  */
4003  if (unlikely(work == weight)) {
4004  if (unlikely(napi_disable_pending(n))) {
4005  local_irq_enable();
4006  napi_complete(n);
4007  local_irq_disable();
4008  } else {
4009  if (n->gro_list) {
4010  /* flush too old packets
4011  * If HZ < 1000, flush all packets.
4012  */
4013  local_irq_enable();
4014  napi_gro_flush(n, HZ >= 1000);
4015  local_irq_disable();
4016  }
4017  list_move_tail(&n->poll_list, &sd->poll_list);
4018  }
4019  }
4020 
4021  netpoll_poll_unlock(have);
4022  }
4023 out:
4024  net_rps_action_and_irq_enable(sd);
4025 
4026 #ifdef CONFIG_NET_DMA
4027  /*
4028  * There may not be any more sk_buffs coming right now, so push
4029  * any pending DMA copies to hardware
4030  */
4031  dma_issue_pending_all();
4032 #endif
4033 
4034  return;
4035 
4036 softnet_break:
4037  sd->time_squeeze++;
4038  __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4039  goto out;
4040 }
4041 
4042 static gifconf_func_t *gifconf_list[NPROTO];
4043 
4053 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4054 {
4055  if (family >= NPROTO)
4056  return -EINVAL;
4057  gifconf_list[family] = gifconf;
4058  return 0;
4059 }
4061 
4062 
4063 /*
4064  * Map an interface index to its name (SIOCGIFNAME)
4065  */
4066 
4067 /*
4068  * We need this ioctl for efficient implementation of the
4069  * if_indextoname() function required by the IPv6 API. Without
4070  * it, we would have to search all the interfaces to find a
4071  * match. --pb
4072  */
4073 
4074 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4075 {
4076  struct net_device *dev;
4077  struct ifreq ifr;
4078 
4079  /*
4080  * Fetch the caller's info block.
4081  */
4082 
4083  if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4084  return -EFAULT;
4085 
4086  rcu_read_lock();
4087  dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4088  if (!dev) {
4089  rcu_read_unlock();
4090  return -ENODEV;
4091  }
4092 
4093  strcpy(ifr.ifr_name, dev->name);
4094  rcu_read_unlock();
4095 
4096  if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4097  return -EFAULT;
4098  return 0;
4099 }
4100 
4101 /*
4102  * Perform a SIOCGIFCONF call. This structure will change
4103  * size eventually, and there is nothing I can do about it.
4104  * Thus we will need a 'compatibility mode'.
4105  */
4106 
4107 static int dev_ifconf(struct net *net, char __user *arg)
4108 {
4109  struct ifconf ifc;
4110  struct net_device *dev;
4111  char __user *pos;
4112  int len;
4113  int total;
4114  int i;
4115 
4116  /*
4117  * Fetch the caller's info block.
4118  */
4119 
4120  if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4121  return -EFAULT;
4122 
4123  pos = ifc.ifc_buf;
4124  len = ifc.ifc_len;
4125 
4126  /*
4127  * Loop over the interfaces, and write an info block for each.
4128  */
4129 
4130  total = 0;
4131  for_each_netdev(net, dev) {
4132  for (i = 0; i < NPROTO; i++) {
4133  if (gifconf_list[i]) {
4134  int done;
4135  if (!pos)
4136  done = gifconf_list[i](dev, NULL, 0);
4137  else
4138  done = gifconf_list[i](dev, pos + total,
4139  len - total);
4140  if (done < 0)
4141  return -EFAULT;
4142  total += done;
4143  }
4144  }
4145  }
4146 
4147  /*
4148  * All done. Write the updated control block back to the caller.
4149  */
4150  ifc.ifc_len = total;
4151 
4152  /*
4153  * Both BSD and Solaris return 0 here, so we do too.
4154  */
4155  return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4156 }
4157 
4158 #ifdef CONFIG_PROC_FS
4159 
4160 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4161 
4162 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4163 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4164 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4165 
4166 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4167 {
4168  struct net *net = seq_file_net(seq);
4169  struct net_device *dev;
4170  struct hlist_node *p;
4171  struct hlist_head *h;
4172  unsigned int count = 0, offset = get_offset(*pos);
4173 
4174  h = &net->dev_name_head[get_bucket(*pos)];
4175  hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4176  if (++count == offset)
4177  return dev;
4178  }
4179 
4180  return NULL;
4181 }
4182 
4183 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4184 {
4185  struct net_device *dev;
4186  unsigned int bucket;
4187 
4188  do {
4189  dev = dev_from_same_bucket(seq, pos);
4190  if (dev)
4191  return dev;
4192 
4193  bucket = get_bucket(*pos) + 1;
4194  *pos = set_bucket_offset(bucket, 1);
4195  } while (bucket < NETDEV_HASHENTRIES);
4196 
4197  return NULL;
4198 }
4199 
4200 /*
4201  * This is invoked by the /proc filesystem handler to display a device
4202  * in detail.
4203  */
4204 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4205  __acquires(RCU)
4206 {
4207  rcu_read_lock();
4208  if (!*pos)
4209  return SEQ_START_TOKEN;
4210 
4211  if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4212  return NULL;
4213 
4214  return dev_from_bucket(seq, pos);
4215 }
4216 
4217 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4218 {
4219  ++*pos;
4220  return dev_from_bucket(seq, pos);
4221 }
4222 
4223 void dev_seq_stop(struct seq_file *seq, void *v)
4224  __releases(RCU)
4225 {
4226  rcu_read_unlock();
4227 }
4228 
4229 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4230 {
4231  struct rtnl_link_stats64 temp;
4232  const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4233 
4234  seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4235  "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4236  dev->name, stats->rx_bytes, stats->rx_packets,
4237  stats->rx_errors,
4238  stats->rx_dropped + stats->rx_missed_errors,
4239  stats->rx_fifo_errors,
4240  stats->rx_length_errors + stats->rx_over_errors +
4241  stats->rx_crc_errors + stats->rx_frame_errors,
4242  stats->rx_compressed, stats->multicast,
4243  stats->tx_bytes, stats->tx_packets,
4244  stats->tx_errors, stats->tx_dropped,
4245  stats->tx_fifo_errors, stats->collisions,
4246  stats->tx_carrier_errors +
4247  stats->tx_aborted_errors +
4248  stats->tx_window_errors +
4249  stats->tx_heartbeat_errors,
4250  stats->tx_compressed);
4251 }
4252 
4253 /*
4254  * Called from the PROCfs module. This now uses the new arbitrary sized
4255  * /proc/net interface to create /proc/net/dev
4256  */
4257 static int dev_seq_show(struct seq_file *seq, void *v)
4258 {
4259  if (v == SEQ_START_TOKEN)
4260  seq_puts(seq, "Inter-| Receive "
4261  " | Transmit\n"
4262  " face |bytes packets errs drop fifo frame "
4263  "compressed multicast|bytes packets errs "
4264  "drop fifo colls carrier compressed\n");
4265  else
4266  dev_seq_printf_stats(seq, v);
4267  return 0;
4268 }
4269 
4270 static struct softnet_data *softnet_get_online(loff_t *pos)
4271 {
4272  struct softnet_data *sd = NULL;
4273 
4274  while (*pos < nr_cpu_ids)
4275  if (cpu_online(*pos)) {
4276  sd = &per_cpu(softnet_data, *pos);
4277  break;
4278  } else
4279  ++*pos;
4280  return sd;
4281 }
4282 
4283 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4284 {
4285  return softnet_get_online(pos);
4286 }
4287 
4288 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4289 {
4290  ++*pos;
4291  return softnet_get_online(pos);
4292 }
4293 
4294 static void softnet_seq_stop(struct seq_file *seq, void *v)
4295 {
4296 }
4297 
4298 static int softnet_seq_show(struct seq_file *seq, void *v)
4299 {
4300  struct softnet_data *sd = v;
4301 
4302  seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4303  sd->processed, sd->dropped, sd->time_squeeze, 0,
4304  0, 0, 0, 0, /* was fastroute */
4305  sd->cpu_collision, sd->received_rps);
4306  return 0;
4307 }
4308 
4309 static const struct seq_operations dev_seq_ops = {
4310  .start = dev_seq_start,
4311  .next = dev_seq_next,
4312  .stop = dev_seq_stop,
4313  .show = dev_seq_show,
4314 };
4315 
4316 static int dev_seq_open(struct inode *inode, struct file *file)
4317 {
4318  return seq_open_net(inode, file, &dev_seq_ops,
4319  sizeof(struct seq_net_private));
4320 }
4321 
4322 static const struct file_operations dev_seq_fops = {
4323  .owner = THIS_MODULE,
4324  .open = dev_seq_open,
4325  .read = seq_read,
4326  .llseek = seq_lseek,
4327  .release = seq_release_net,
4328 };
4329 
4330 static const struct seq_operations softnet_seq_ops = {
4331  .start = softnet_seq_start,
4332  .next = softnet_seq_next,
4333  .stop = softnet_seq_stop,
4334  .show = softnet_seq_show,
4335 };
4336 
4337 static int softnet_seq_open(struct inode *inode, struct file *file)
4338 {
4339  return seq_open(file, &softnet_seq_ops);
4340 }
4341 
4342 static const struct file_operations softnet_seq_fops = {
4343  .owner = THIS_MODULE,
4344  .open = softnet_seq_open,
4345  .read = seq_read,
4346  .llseek = seq_lseek,
4347  .release = seq_release,
4348 };
4349 
4350 static void *ptype_get_idx(loff_t pos)
4351 {
4352  struct packet_type *pt = NULL;
4353  loff_t i = 0;
4354  int t;
4355 
4356  list_for_each_entry_rcu(pt, &ptype_all, list) {
4357  if (i == pos)
4358  return pt;
4359  ++i;
4360  }
4361 
4362  for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4363  list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4364  if (i == pos)
4365  return pt;
4366  ++i;
4367  }
4368  }
4369  return NULL;
4370 }
4371 
4372 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4373  __acquires(RCU)
4374 {
4375  rcu_read_lock();
4376  return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4377 }
4378 
4379 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4380 {
4381  struct packet_type *pt;
4382  struct list_head *nxt;
4383  int hash;
4384 
4385  ++*pos;
4386  if (v == SEQ_START_TOKEN)
4387  return ptype_get_idx(0);
4388 
4389  pt = v;
4390  nxt = pt->list.next;
4391  if (pt->type == htons(ETH_P_ALL)) {
4392  if (nxt != &ptype_all)
4393  goto found;
4394  hash = 0;
4395  nxt = ptype_base[0].next;
4396  } else
4397  hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4398 
4399  while (nxt == &ptype_base[hash]) {
4400  if (++hash >= PTYPE_HASH_SIZE)
4401  return NULL;
4402  nxt = ptype_base[hash].next;
4403  }
4404 found:
4405  return list_entry(nxt, struct packet_type, list);
4406 }
4407 
4408 static void ptype_seq_stop(struct seq_file *seq, void *v)
4409  __releases(RCU)
4410 {
4411  rcu_read_unlock();
4412 }
4413 
4414 static int ptype_seq_show(struct seq_file *seq, void *v)
4415 {
4416  struct packet_type *pt = v;
4417 
4418  if (v == SEQ_START_TOKEN)
4419  seq_puts(seq, "Type Device Function\n");
4420  else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4421  if (pt->type == htons(ETH_P_ALL))
4422  seq_puts(seq, "ALL ");
4423  else
4424  seq_printf(seq, "%04x", ntohs(pt->type));
4425 
4426  seq_printf(seq, " %-8s %pF\n",
4427  pt->dev ? pt->dev->name : "", pt->func);
4428  }
4429 
4430  return 0;
4431 }
4432 
4433 static const struct seq_operations ptype_seq_ops = {
4434  .start = ptype_seq_start,
4435  .next = ptype_seq_next,
4436  .stop = ptype_seq_stop,
4437  .show = ptype_seq_show,
4438 };
4439 
4440 static int ptype_seq_open(struct inode *inode, struct file *file)
4441 {
4442  return seq_open_net(inode, file, &ptype_seq_ops,
4443  sizeof(struct seq_net_private));
4444 }
4445 
4446 static const struct file_operations ptype_seq_fops = {
4447  .owner = THIS_MODULE,
4448  .open = ptype_seq_open,
4449  .read = seq_read,
4450  .llseek = seq_lseek,
4451  .release = seq_release_net,
4452 };
4453 
4454 
4455 static int __net_init dev_proc_net_init(struct net *net)
4456 {
4457  int rc = -ENOMEM;
4458 
4459  if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4460  goto out;
4461  if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4462  goto out_dev;
4463  if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4464  goto out_softnet;
4465 
4466  if (wext_proc_init(net))
4467  goto out_ptype;
4468  rc = 0;
4469 out:
4470  return rc;
4471 out_ptype:
4472  proc_net_remove(net, "ptype");
4473 out_softnet:
4474  proc_net_remove(net, "softnet_stat");
4475 out_dev:
4476  proc_net_remove(net, "dev");
4477  goto out;
4478 }
4479 
4480 static void __net_exit dev_proc_net_exit(struct net *net)
4481 {
4482  wext_proc_exit(net);
4483 
4484  proc_net_remove(net, "ptype");
4485  proc_net_remove(net, "softnet_stat");
4486  proc_net_remove(net, "dev");
4487 }
4488 
4489 static struct pernet_operations __net_initdata dev_proc_ops = {
4490  .init = dev_proc_net_init,
4491  .exit = dev_proc_net_exit,
4492 };
4493 
4494 static int __init dev_proc_init(void)
4495 {
4496  return register_pernet_subsys(&dev_proc_ops);
4497 }
4498 #else
4499 #define dev_proc_init() 0
4500 #endif /* CONFIG_PROC_FS */
4501 
4502 
4513 int netdev_set_master(struct net_device *slave, struct net_device *master)
4514 {
4515  struct net_device *old = slave->master;
4516 
4517  ASSERT_RTNL();
4518 
4519  if (master) {
4520  if (old)
4521  return -EBUSY;
4522  dev_hold(master);
4523  }
4524 
4525  slave->master = master;
4526 
4527  if (old)
4528  dev_put(old);
4529  return 0;
4530 }
4532 
4543 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4544 {
4545  int err;
4546 
4547  ASSERT_RTNL();
4548 
4549  err = netdev_set_master(slave, master);
4550  if (err)
4551  return err;
4552  if (master)
4553  slave->flags |= IFF_SLAVE;
4554  else
4555  slave->flags &= ~IFF_SLAVE;
4556 
4557  rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4558  return 0;
4559 }
4561 
4562 static void dev_change_rx_flags(struct net_device *dev, int flags)
4563 {
4564  const struct net_device_ops *ops = dev->netdev_ops;
4565 
4566  if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4567  ops->ndo_change_rx_flags(dev, flags);
4568 }
4569 
4570 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4571 {
4572  unsigned int old_flags = dev->flags;
4573  kuid_t uid;
4574  kgid_t gid;
4575 
4576  ASSERT_RTNL();
4577 
4578  dev->flags |= IFF_PROMISC;
4579  dev->promiscuity += inc;
4580  if (dev->promiscuity == 0) {
4581  /*
4582  * Avoid overflow.
4583  * If inc causes overflow, untouch promisc and return error.
4584  */
4585  if (inc < 0)
4586  dev->flags &= ~IFF_PROMISC;
4587  else {
4588  dev->promiscuity -= inc;
4589  pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4590  dev->name);
4591  return -EOVERFLOW;
4592  }
4593  }
4594  if (dev->flags != old_flags) {
4595  pr_info("device %s %s promiscuous mode\n",
4596  dev->name,
4597  dev->flags & IFF_PROMISC ? "entered" : "left");
4598  if (audit_enabled) {
4599  current_uid_gid(&uid, &gid);
4600  audit_log(current->audit_context, GFP_ATOMIC,
4601  AUDIT_ANOM_PROMISCUOUS,
4602  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4603  dev->name, (dev->flags & IFF_PROMISC),
4604  (old_flags & IFF_PROMISC),
4605  from_kuid(&init_user_ns, audit_get_loginuid(current)),
4606  from_kuid(&init_user_ns, uid),
4607  from_kgid(&init_user_ns, gid),
4608  audit_get_sessionid(current));
4609  }
4610 
4611  dev_change_rx_flags(dev, IFF_PROMISC);
4612  }
4613  return 0;
4614 }
4615 
4627 int dev_set_promiscuity(struct net_device *dev, int inc)
4628 {
4629  unsigned int old_flags = dev->flags;
4630  int err;
4631 
4632  err = __dev_set_promiscuity(dev, inc);
4633  if (err < 0)
4634  return err;
4635  if (dev->flags != old_flags)
4636  dev_set_rx_mode(dev);
4637  return err;
4638 }
4640 
4654 int dev_set_allmulti(struct net_device *dev, int inc)
4655 {
4656  unsigned int old_flags = dev->flags;
4657 
4658  ASSERT_RTNL();
4659 
4660  dev->flags |= IFF_ALLMULTI;
4661  dev->allmulti += inc;
4662  if (dev->allmulti == 0) {
4663  /*
4664  * Avoid overflow.
4665  * If inc causes overflow, untouch allmulti and return error.
4666  */
4667  if (inc < 0)
4668  dev->flags &= ~IFF_ALLMULTI;
4669  else {
4670  dev->allmulti -= inc;
4671  pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4672  dev->name);
4673  return -EOVERFLOW;
4674  }
4675  }
4676  if (dev->flags ^ old_flags) {
4677  dev_change_rx_flags(dev, IFF_ALLMULTI);
4678  dev_set_rx_mode(dev);
4679  }
4680  return 0;
4681 }
4683 
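/* Editor's note: an illustrative sketch, not part of dev.c.  Code that needs
 * to see all traffic uses the reference-counted helpers rather than writing
 * dev->flags directly, and every +1 must later be balanced by a -1.
 * capture_start() and capture_stop() are hypothetical.
 */
static int capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();	/* the promiscuity helpers run under the rtnl */
	err = dev_set_promiscuity(dev, 1);	/* take one promisc reference */
	rtnl_unlock();

	return err;	/* -EOVERFLOW if the counter would wrap */
}

static void capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}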
4684 /*
4685  * Upload unicast and multicast address lists to device and
4686  * configure RX filtering. When the device doesn't support unicast
4687  * filtering it is put in promiscuous mode while unicast addresses
4688  * are present.
4689  */
4690 void __dev_set_rx_mode(struct net_device *dev)
4691 {
4692  const struct net_device_ops *ops = dev->netdev_ops;
4693 
4694  /* dev_open will call this function so the list will stay sane. */
4695  if (!(dev->flags&IFF_UP))
4696  return;
4697 
4698  if (!netif_device_present(dev))
4699  return;
4700 
4701  if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4702  /* Unicast addresses changes may only happen under the rtnl,
4703  * therefore calling __dev_set_promiscuity here is safe.
4704  */
4705  if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4706  __dev_set_promiscuity(dev, 1);
4707  dev->uc_promisc = true;
4708  } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4709  __dev_set_promiscuity(dev, -1);
4710  dev->uc_promisc = false;
4711  }
4712  }
4713 
4714  if (ops->ndo_set_rx_mode)
4715  ops->ndo_set_rx_mode(dev);
4716 }
4717 
4718 void dev_set_rx_mode(struct net_device *dev)
4719 {
4720  netif_addr_lock_bh(dev);
4721  __dev_set_rx_mode(dev);
4722  netif_addr_unlock_bh(dev);
4723 }
4724 
4731 unsigned int dev_get_flags(const struct net_device *dev)
4732 {
4733  unsigned int flags;
4734 
4735  flags = (dev->flags & ~(IFF_PROMISC |
4736  IFF_ALLMULTI |
4737  IFF_RUNNING |
4738  IFF_LOWER_UP |
4739  IFF_DORMANT)) |
4740  (dev->gflags & (IFF_PROMISC |
4741  IFF_ALLMULTI));
4742 
4743  if (netif_running(dev)) {
4744  if (netif_oper_up(dev))
4745  flags |= IFF_RUNNING;
4746  if (netif_carrier_ok(dev))
4747  flags |= IFF_LOWER_UP;
4748  if (netif_dormant(dev))
4749  flags |= IFF_DORMANT;
4750  }
4751 
4752  return flags;
4753 }
4755 
4756 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4757 {
4758  unsigned int old_flags = dev->flags;
4759  int ret;
4760 
4761  ASSERT_RTNL();
4762 
4763  /*
4764  * Set the flags on our device.
4765  */
4766 
4767  dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4768  IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4769  IFF_AUTOMEDIA)) |
4770  (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4771  IFF_ALLMULTI));
4772 
4773  /*
4774  * Load in the correct multicast list now the flags have changed.
4775  */
4776 
4777  if ((old_flags ^ flags) & IFF_MULTICAST)
4778  dev_change_rx_flags(dev, IFF_MULTICAST);
4779 
4780  dev_set_rx_mode(dev);
4781 
4782  /*
4783  * Have we downed the interface. We handle IFF_UP ourselves
4784  * according to user attempts to set it, rather than blindly
4785  * setting it.
4786  */
4787 
4788  ret = 0;
4789  if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4790  ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4791 
4792  if (!ret)
4793  dev_set_rx_mode(dev);
4794  }
4795 
4796  if ((flags ^ dev->gflags) & IFF_PROMISC) {
4797  int inc = (flags & IFF_PROMISC) ? 1 : -1;
4798 
4799  dev->gflags ^= IFF_PROMISC;
4800  dev_set_promiscuity(dev, inc);
4801  }
4802 
4803  /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4804  is important. Some (broken) drivers set IFF_PROMISC when
4805  IFF_ALLMULTI is requested, without asking us and without reporting it.
4806  */
4807  if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4808  int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4809 
4810  dev->gflags ^= IFF_ALLMULTI;
4811  dev_set_allmulti(dev, inc);
4812  }
4813 
4814  return ret;
4815 }
4816 
4817 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4818 {
4819  unsigned int changes = dev->flags ^ old_flags;
4820 
4821  if (changes & IFF_UP) {
4822  if (dev->flags & IFF_UP)
4823  call_netdevice_notifiers(NETDEV_UP, dev);
4824  else
4825  call_netdevice_notifiers(NETDEV_DOWN, dev);
4826  }
4827 
4828  if (dev->flags & IFF_UP &&
4829  (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4830  call_netdevice_notifiers(NETDEV_CHANGE, dev);
4831 }
4832 
4841 int dev_change_flags(struct net_device *dev, unsigned int flags)
4842 {
4843  int ret;
4844  unsigned int changes, old_flags = dev->flags;
4845 
4846  ret = __dev_change_flags(dev, flags);
4847  if (ret < 0)
4848  return ret;
4849 
4850  changes = old_flags ^ dev->flags;
4851  if (changes)
4852  rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4853 
4854  __dev_notify_flags(dev, old_flags);
4855  return ret;
4856 }
4858 
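/* Editor's note: an illustrative sketch, not part of dev.c, of how in-kernel
 * code brings an interface up or down through the same path the SIOCSIFFLAGS
 * ioctl uses.  set_iface_up() is hypothetical.
 */
static int set_iface_up(struct net_device *dev, bool up)
{
	unsigned int flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);
	if (up)
		flags |= IFF_UP;
	else
		flags &= ~IFF_UP;
	err = dev_change_flags(dev, flags);	/* notifies userspace on change */
	rtnl_unlock();

	return err;
}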
4866 int dev_set_mtu(struct net_device *dev, int new_mtu)
4867 {
4868  const struct net_device_ops *ops = dev->netdev_ops;
4869  int err;
4870 
4871  if (new_mtu == dev->mtu)
4872  return 0;
4873 
4874  /* MTU must be positive. */
4875  if (new_mtu < 0)
4876  return -EINVAL;
4877 
4878  if (!netif_device_present(dev))
4879  return -ENODEV;
4880 
4881  err = 0;
4882  if (ops->ndo_change_mtu)
4883  err = ops->ndo_change_mtu(dev, new_mtu);
4884  else
4885  dev->mtu = new_mtu;
4886 
4887  if (!err && dev->flags & IFF_UP)
4888  call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4889  return err;
4890 }
4892 
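/* Editor's note: an illustrative sketch, not part of dev.c.  dev_set_mtu()
 * is conventionally called with the rtnl held; range checking beyond the
 * sign test is left to the driver's ndo_change_mtu, and NETDEV_CHANGEMTU
 * fires when an up device changes its MTU.  set_jumbo_mtu() is hypothetical.
 */
static int set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);	/* the driver may reject this value */
	rtnl_unlock();

	return err;
}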
4898 void dev_set_group(struct net_device *dev, int new_group)
4899 {
4900  dev->group = new_group;
4901 }
4903 
4911 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4912 {
4913  const struct net_device_ops *ops = dev->netdev_ops;
4914  int err;
4915 
4916  if (!ops->ndo_set_mac_address)
4917  return -EOPNOTSUPP;
4918  if (sa->sa_family != dev->type)
4919  return -EINVAL;
4920  if (!netif_device_present(dev))
4921  return -ENODEV;
4922  err = ops->ndo_set_mac_address(dev, sa);
4923  if (!err)
4924  call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4925  add_device_randomness(dev->dev_addr, dev->addr_len);
4926  return err;
4927 }
4929 
4930 /*
4931  * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4932  */
4933 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4934 {
4935  int err;
4936  struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4937 
4938  if (!dev)
4939  return -ENODEV;
4940 
4941  switch (cmd) {
4942  case SIOCGIFFLAGS: /* Get interface flags */
4943  ifr->ifr_flags = (short) dev_get_flags(dev);
4944  return 0;
4945 
4946  case SIOCGIFMETRIC: /* Get the metric on the interface
4947  (currently unused) */
4948  ifr->ifr_metric = 0;
4949  return 0;
4950 
4951  case SIOCGIFMTU: /* Get the MTU of a device */
4952  ifr->ifr_mtu = dev->mtu;
4953  return 0;
4954 
4955  case SIOCGIFHWADDR:
4956  if (!dev->addr_len)
4957  memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4958  else
4959  memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4960  min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4961  ifr->ifr_hwaddr.sa_family = dev->type;
4962  return 0;
4963 
4964  case SIOCGIFSLAVE:
4965  err = -EINVAL;
4966  break;
4967 
4968  case SIOCGIFMAP:
4969  ifr->ifr_map.mem_start = dev->mem_start;
4970  ifr->ifr_map.mem_end = dev->mem_end;
4971  ifr->ifr_map.base_addr = dev->base_addr;
4972  ifr->ifr_map.irq = dev->irq;
4973  ifr->ifr_map.dma = dev->dma;
4974  ifr->ifr_map.port = dev->if_port;
4975  return 0;
4976 
4977  case SIOCGIFINDEX:
4978  ifr->ifr_ifindex = dev->ifindex;
4979  return 0;
4980 
4981  case SIOCGIFTXQLEN:
4982  ifr->ifr_qlen = dev->tx_queue_len;
4983  return 0;
4984 
4985  default:
4986  /* dev_ioctl() should ensure this case
4987  * is never reached
4988  */
4989  WARN_ON(1);
4990  err = -ENOTTY;
4991  break;
4992 
4993  }
4994  return err;
4995 }
4996 
4997 /*
4998  * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4999  */
5000 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5001 {
5002  int err;
5003  struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5004  const struct net_device_ops *ops;
5005 
5006  if (!dev)
5007  return -ENODEV;
5008 
5009  ops = dev->netdev_ops;
5010 
5011  switch (cmd) {
5012  case SIOCSIFFLAGS: /* Set interface flags */
5013  return dev_change_flags(dev, ifr->ifr_flags);
5014 
5015  case SIOCSIFMETRIC: /* Set the metric on the interface
5016  (currently unused) */
5017  return -EOPNOTSUPP;
5018 
5019  case SIOCSIFMTU: /* Set the MTU of a device */
5020  return dev_set_mtu(dev, ifr->ifr_mtu);
5021 
5022  case SIOCSIFHWADDR:
5023  return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5024 
5025  case SIOCSIFHWBROADCAST:
5026  if (ifr->ifr_hwaddr.sa_family != dev->type)
5027  return -EINVAL;
5028  memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5029  min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5030  call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5031  return 0;
5032 
5033  case SIOCSIFMAP:
5034  if (ops->ndo_set_config) {
5035  if (!netif_device_present(dev))
5036  return -ENODEV;
5037  return ops->ndo_set_config(dev, &ifr->ifr_map);
5038  }
5039  return -EOPNOTSUPP;
5040 
5041  case SIOCADDMULTI:
5042  if (!ops->ndo_set_rx_mode ||
5043  ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5044  return -EINVAL;
5045  if (!netif_device_present(dev))
5046  return -ENODEV;
5047  return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5048 
5049  case SIOCDELMULTI:
5050  if (!ops->ndo_set_rx_mode ||
5051  ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5052  return -EINVAL;
5053  if (!netif_device_present(dev))
5054  return -ENODEV;
5055  return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5056 
5057  case SIOCSIFTXQLEN:
5058  if (ifr->ifr_qlen < 0)
5059  return -EINVAL;
5060  dev->tx_queue_len = ifr->ifr_qlen;
5061  return 0;
5062 
5063  case SIOCSIFNAME:
5064  ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5065  return dev_change_name(dev, ifr->ifr_newname);
5066 
5067  case SIOCSHWTSTAMP:
5068  err = net_hwtstamp_validate(ifr);
5069  if (err)
5070  return err;
5071  /* fall through */
5072 
5073  /*
5074  * Unknown or private ioctl
5075  */
5076  default:
5077  if ((cmd >= SIOCDEVPRIVATE &&
5078  cmd <= SIOCDEVPRIVATE + 15) ||
5079  cmd == SIOCBONDENSLAVE ||
5080  cmd == SIOCBONDRELEASE ||
5081  cmd == SIOCBONDSETHWADDR ||
5082  cmd == SIOCBONDSLAVEINFOQUERY ||
5083  cmd == SIOCBONDINFOQUERY ||
5084  cmd == SIOCBONDCHANGEACTIVE ||
5085  cmd == SIOCGMIIPHY ||
5086  cmd == SIOCGMIIREG ||
5087  cmd == SIOCSMIIREG ||
5088  cmd == SIOCBRADDIF ||
5089  cmd == SIOCBRDELIF ||
5090  cmd == SIOCSHWTSTAMP ||
5091  cmd == SIOCWANDEV) {
5092  err = -EOPNOTSUPP;
5093  if (ops->ndo_do_ioctl) {
5094  if (netif_device_present(dev))
5095  err = ops->ndo_do_ioctl(dev, ifr, cmd);
5096  else
5097  err = -ENODEV;
5098  }
5099  } else
5100  err = -EINVAL;
5101 
5102  }
5103  return err;
5104 }
5105 
5106 /*
5107  * This function handles all "interface"-type I/O control requests. The actual
5108  * 'doing' part of this is dev_ifsioc above.
5109  */
5110 
5123 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5124 {
5125  struct ifreq ifr;
5126  int ret;
5127  char *colon;
5128 
5129  /* One special case: SIOCGIFCONF takes ifconf argument
5130  and requires shared lock, because it sleeps writing
5131  to user space.
5132  */
5133 
5134  if (cmd == SIOCGIFCONF) {
5135  rtnl_lock();
5136  ret = dev_ifconf(net, (char __user *) arg);
5137  rtnl_unlock();
5138  return ret;
5139  }
5140  if (cmd == SIOCGIFNAME)
5141  return dev_ifname(net, (struct ifreq __user *)arg);
5142 
5143  if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5144  return -EFAULT;
5145 
5146  ifr.ifr_name[IFNAMSIZ-1] = 0;
5147 
5148  colon = strchr(ifr.ifr_name, ':');
5149  if (colon)
5150  *colon = 0;
5151 
5152  /*
5153  * See which interface the caller is talking about.
5154  */
5155 
5156  switch (cmd) {
5157  /*
5158  * These ioctl calls:
5159  * - can be done by all.
5160  * - atomic and do not require locking.
5161  * - return a value
5162  */
5163  case SIOCGIFFLAGS:
5164  case SIOCGIFMETRIC:
5165  case SIOCGIFMTU:
5166  case SIOCGIFHWADDR:
5167  case SIOCGIFSLAVE:
5168  case SIOCGIFMAP:
5169  case SIOCGIFINDEX:
5170  case SIOCGIFTXQLEN:
5171  dev_load(net, ifr.ifr_name);
5172  rcu_read_lock();
5173  ret = dev_ifsioc_locked(net, &ifr, cmd);
5174  rcu_read_unlock();
5175  if (!ret) {
5176  if (colon)
5177  *colon = ':';
5178  if (copy_to_user(arg, &ifr,
5179  sizeof(struct ifreq)))
5180  ret = -EFAULT;
5181  }
5182  return ret;
5183 
5184  case SIOCETHTOOL:
5185  dev_load(net, ifr.ifr_name);
5186  rtnl_lock();
5187  ret = dev_ethtool(net, &ifr);
5188  rtnl_unlock();
5189  if (!ret) {
5190  if (colon)
5191  *colon = ':';
5192  if (copy_to_user(arg, &ifr,
5193  sizeof(struct ifreq)))
5194  ret = -EFAULT;
5195  }
5196  return ret;
5197 
5198  /*
5199  * These ioctl calls:
5200  * - require superuser power.
5201  * - require strict serialization.
5202  * - return a value
5203  */
5204  case SIOCGMIIPHY:
5205  case SIOCGMIIREG:
5206  case SIOCSIFNAME:
5207  if (!capable(CAP_NET_ADMIN))
5208  return -EPERM;
5209  dev_load(net, ifr.ifr_name);
5210  rtnl_lock();
5211  ret = dev_ifsioc(net, &ifr, cmd);
5212  rtnl_unlock();
5213  if (!ret) {
5214  if (colon)
5215  *colon = ':';
5216  if (copy_to_user(arg, &ifr,
5217  sizeof(struct ifreq)))
5218  ret = -EFAULT;
5219  }
5220  return ret;
5221 
5222  /*
5223  * These ioctl calls:
5224  * - require superuser power.
5225  * - require strict serialization.
5226  * - do not return a value
5227  */
5228  case SIOCSIFFLAGS:
5229  case SIOCSIFMETRIC:
5230  case SIOCSIFMTU:
5231  case SIOCSIFMAP:
5232  case SIOCSIFHWADDR:
5233  case SIOCSIFSLAVE:
5234  case SIOCADDMULTI:
5235  case SIOCDELMULTI:
5236  case SIOCSIFHWBROADCAST:
5237  case SIOCSIFTXQLEN:
5238  case SIOCSMIIREG:
5239  case SIOCBONDENSLAVE:
5240  case SIOCBONDRELEASE:
5241  case SIOCBONDSETHWADDR:
5242  case SIOCBONDCHANGEACTIVE:
5243  case SIOCBRADDIF:
5244  case SIOCBRDELIF:
5245  case SIOCSHWTSTAMP:
5246  if (!capable(CAP_NET_ADMIN))
5247  return -EPERM;
5248  /* fall through */
5249  case SIOCBONDSLAVEINFOQUERY:
5250  case SIOCBONDINFOQUERY:
5251  dev_load(net, ifr.ifr_name);
5252  rtnl_lock();
5253  ret = dev_ifsioc(net, &ifr, cmd);
5254  rtnl_unlock();
5255  return ret;
5256 
5257  case SIOCGIFMEM:
5258  /* Get the per device memory space. We can add this but
5259  * currently do not support it */
5260  case SIOCSIFMEM:
5261  /* Set the per device memory buffer space.
5262  * Not applicable in our case */
5263  case SIOCSIFLINK:
5264  return -ENOTTY;
5265 
5266  /*
5267  * Unknown or private ioctl.
5268  */
5269  default:
5270  if (cmd == SIOCWANDEV ||
5271  (cmd >= SIOCDEVPRIVATE &&
5272  cmd <= SIOCDEVPRIVATE + 15)) {
5273  dev_load(net, ifr.ifr_name);
5274  rtnl_lock();
5275  ret = dev_ifsioc(net, &ifr, cmd);
5276  rtnl_unlock();
5277  if (!ret && copy_to_user(arg, &ifr,
5278  sizeof(struct ifreq)))
5279  ret = -EFAULT;
5280  return ret;
5281  }
5282  /* Take care of Wireless Extensions */
5283  if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5284  return wext_handle_ioctl(net, &ifr, cmd, arg);
5285  return -ENOTTY;
5286  }
5287 }
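/*
 * Editor's illustrative sketch (userspace, not part of dev.c): the ioctl
 * requests dispatched above are reached through an ordinary AF_INET socket.
 * SIOCGIFMTU is an unprivileged "get" handled under rcu_read_lock() in
 * dev_ifsioc_locked(); SIOCSIFMTU goes through dev_ifsioc() under rtnl_lock()
 * and needs CAP_NET_ADMIN.  The interface name "eth0" and the MTU value are
 * assumptions for the example.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)	/* unprivileged get path */
		printf("current MTU: %d\n", ifr.ifr_mtu);

	ifr.ifr_mtu = 1400;
	if (ioctl(fd, SIOCSIFMTU, &ifr) < 0)	/* privileged set path */
		perror("SIOCSIFMTU");

	close(fd);
	return 0;
}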
5288 
5289 
5298 static int dev_new_index(struct net *net)
5299 {
5300  int ifindex = net->ifindex;
5301  for (;;) {
5302  if (++ifindex <= 0)
5303  ifindex = 1;
5304  if (!__dev_get_by_index(net, ifindex))
5305  return net->ifindex = ifindex;
5306  }
5307 }
5308 
5309 /* Delayed registration/unregisteration */
5310 static LIST_HEAD(net_todo_list);
5311 
5312 static void net_set_todo(struct net_device *dev)
5313 {
5314  list_add_tail(&dev->todo_list, &net_todo_list);
5315 }
5316 
5317 static void rollback_registered_many(struct list_head *head)
5318 {
5319  struct net_device *dev, *tmp;
5320 
5321  BUG_ON(dev_boot_phase);
5322  ASSERT_RTNL();
5323 
5324  list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5325  /* Some devices call without registering
5326  * for initialization unwind. Remove those
5327  * devices and proceed with the remaining.
5328  */
5329  if (dev->reg_state == NETREG_UNINITIALIZED) {
5330  pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5331  dev->name, dev);
5332 
5333  WARN_ON(1);
5334  list_del(&dev->unreg_list);
5335  continue;
5336  }
5337  dev->dismantle = true;
5338  BUG_ON(dev->reg_state != NETREG_REGISTERED);
5339  }
5340 
5341  /* If device is running, close it first. */
5342  dev_close_many(head);
5343 
5344  list_for_each_entry(dev, head, unreg_list) {
5345  /* And unlink it from device chain. */
5346  unlist_netdevice(dev);
5347 
5347 
5348  dev->reg_state = NETREG_UNREGISTERING;
5349  }
5350 
5351  synchronize_net();
5352 
5353  list_for_each_entry(dev, head, unreg_list) {
5354  /* Shutdown queueing discipline. */
5355  dev_shutdown(dev);
5356 
5357 
5358  /* Notify protocols, that we are about to destroy
5359  this device. They should clean all the things.
5360  */
5361  call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5362 
5363  if (!dev->rtnl_link_ops ||
5364  dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5365  rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5366 
5367  /*
5368  * Flush the unicast and multicast chains
5369  */
5370  dev_uc_flush(dev);
5371  dev_mc_flush(dev);
5372 
5373  if (dev->netdev_ops->ndo_uninit)
5374  dev->netdev_ops->ndo_uninit(dev);
5375 
5376  /* Notifier chain MUST detach us from master device. */
5377  WARN_ON(dev->master);
5378 
5379  /* Remove entries from kobject tree */
5380  netdev_unregister_kobject(dev);
5381  }
5382 
5383  synchronize_net();
5384 
5385  list_for_each_entry(dev, head, unreg_list)
5386  dev_put(dev);
5387 }
5388 
5389 static void rollback_registered(struct net_device *dev)
5390 {
5391  LIST_HEAD(single);
5392 
5393  list_add(&dev->unreg_list, &single);
5394  rollback_registered_many(&single);
5395  list_del(&single);
5396 }
5397 
5398 static netdev_features_t netdev_fix_features(struct net_device *dev,
5399  netdev_features_t features)
5400 {
5401  /* Fix illegal checksum combinations */
5402  if ((features & NETIF_F_HW_CSUM) &&
5403  (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5404  netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5405  features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5406  }
5407 
5408  /* Fix illegal SG+CSUM combinations. */
5409  if ((features & NETIF_F_SG) &&
5410  !(features & NETIF_F_ALL_CSUM)) {
5411  netdev_dbg(dev,
5412  "Dropping NETIF_F_SG since no checksum feature.\n");
5413  features &= ~NETIF_F_SG;
5414  }
5415 
5416  /* TSO requires that SG is present as well. */
5417  if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5418  netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5419  features &= ~NETIF_F_ALL_TSO;
5420  }
5421 
5422  /* TSO ECN requires that TSO is present as well. */
5423  if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5424  features &= ~NETIF_F_TSO_ECN;
5425 
5426  /* Software GSO depends on SG. */
5427  if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5428  netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5429  features &= ~NETIF_F_GSO;
5430  }
5431 
5432  /* UFO needs SG and checksumming */
5433  if (features & NETIF_F_UFO) {
5434  /* maybe split UFO into V4 and V6? */
5435  if (!((features & NETIF_F_GEN_CSUM) ||
5436  (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5437  == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5438  netdev_dbg(dev,
5439  "Dropping NETIF_F_UFO since no checksum offload features.\n");
5440  features &= ~NETIF_F_UFO;
5441  }
5442 
5443  if (!(features & NETIF_F_SG)) {
5444  netdev_dbg(dev,
5445  "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5446  features &= ~NETIF_F_UFO;
5447  }
5448  }
5449 
5450  return features;
5451 }
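/*
 * Editor's illustrative sketch (not part of dev.c): a driver-side
 * ndo_fix_features() hook in the same spirit as netdev_fix_features() above;
 * it enforces a device-specific dependency before the core applies the
 * generic rules in __netdev_update_features().  The function name and the
 * particular constraint (no TSO without RX checksumming) are hypothetical.
 */
#include <linux/netdevice.h>

static netdev_features_t example_fix_features(struct net_device *dev,
					      netdev_features_t features)
{
	/* Hypothetical hardware limit: TSO only works with RX checksum enabled */
	if (!(features & NETIF_F_RXCSUM)) {
		netdev_dbg(dev, "disabling TSO since RXCSUM is off\n");
		features &= ~NETIF_F_ALL_TSO;
	}
	return features;
}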
5452 
5453 int __netdev_update_features(struct net_device *dev)
5454 {
5455  netdev_features_t features;
5456  int err = 0;
5457 
5458  ASSERT_RTNL();
5459 
5460  features = netdev_get_wanted_features(dev);
5461 
5462  if (dev->netdev_ops->ndo_fix_features)
5463  features = dev->netdev_ops->ndo_fix_features(dev, features);
5464 
5465  /* driver might be less strict about feature dependencies */
5466  features = netdev_fix_features(dev, features);
5467 
5468  if (dev->features == features)
5469  return 0;
5470 
5471  netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5472  &dev->features, &features);
5473 
5474  if (dev->netdev_ops->ndo_set_features)
5475  err = dev->netdev_ops->ndo_set_features(dev, features);
5476 
5477  if (unlikely(err < 0)) {
5478  netdev_err(dev,
5479  "set_features() failed (%d); wanted %pNF, left %pNF\n",
5480  err, &features, &dev->features);
5481  return -1;
5482  }
5483 
5484  if (!err)
5485  dev->features = features;
5486 
5487  return 1;
5488 }
5489 
5498 void netdev_update_features(struct net_device *dev)
5499 {
5500  if (__netdev_update_features(dev))
5501  netdev_features_change(dev);
5502 }
5503 EXPORT_SYMBOL(netdev_update_features);
5504 
5515 void netdev_change_features(struct net_device *dev)
5516 {
5517  __netdev_update_features(dev);
5518  netdev_features_change(dev);
5519 }
5520 EXPORT_SYMBOL(netdev_change_features);
5521 
5531 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5532  struct net_device *dev)
5533 {
5534  if (rootdev->operstate == IF_OPER_DORMANT)
5535  netif_dormant_on(dev);
5536  else
5537  netif_dormant_off(dev);
5538 
5539  if (netif_carrier_ok(rootdev)) {
5540  if (!netif_carrier_ok(dev))
5541  netif_carrier_on(dev);
5542  } else {
5543  if (netif_carrier_ok(dev))
5544  netif_carrier_off(dev);
5545  }
5546 }
5548 
5549 #ifdef CONFIG_RPS
5550 static int netif_alloc_rx_queues(struct net_device *dev)
5551 {
5552  unsigned int i, count = dev->num_rx_queues;
5553  struct netdev_rx_queue *rx;
5554 
5555  BUG_ON(count < 1);
5556 
5557  rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5558  if (!rx) {
5559  pr_err("netdev: Unable to allocate %u rx queues\n", count);
5560  return -ENOMEM;
5561  }
5562  dev->_rx = rx;
5563 
5564  for (i = 0; i < count; i++)
5565  rx[i].dev = dev;
5566  return 0;
5567 }
5568 #endif
5569 
5570 static void netdev_init_one_queue(struct net_device *dev,
5571  struct netdev_queue *queue, void *_unused)
5572 {
5573  /* Initialize queue lock */
5574  spin_lock_init(&queue->_xmit_lock);
5575  netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5576  queue->xmit_lock_owner = -1;
5577  netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5578  queue->dev = dev;
5579 #ifdef CONFIG_BQL
5580  dql_init(&queue->dql, HZ);
5581 #endif
5582 }
5583 
5584 static int netif_alloc_netdev_queues(struct net_device *dev)
5585 {
5586  unsigned int count = dev->num_tx_queues;
5587  struct netdev_queue *tx;
5588 
5589  BUG_ON(count < 1);
5590 
5591  tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5592  if (!tx) {
5593  pr_err("netdev: Unable to allocate %u tx queues\n", count);
5594  return -ENOMEM;
5595  }
5596  dev->_tx = tx;
5597 
5598  netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5599  spin_lock_init(&dev->tx_global_lock);
5600 
5601  return 0;
5602 }
5603 
5621 int register_netdevice(struct net_device *dev)
5622 {
5623  int ret;
5624  struct net *net = dev_net(dev);
5625 
5626  BUG_ON(dev_boot_phase);
5627  ASSERT_RTNL();
5628 
5629  might_sleep();
5630 
5631  /* When net_device's are persistent, this will be fatal. */
5632  BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5633  BUG_ON(!net);
5634 
5635  spin_lock_init(&dev->addr_list_lock);
5636  netdev_set_addr_lockdep_class(dev);
5637 
5638  dev->iflink = -1;
5639 
5640  ret = dev_get_valid_name(net, dev, dev->name);
5641  if (ret < 0)
5642  goto out;
5643 
5644  /* Init, if this function is available */
5645  if (dev->netdev_ops->ndo_init) {
5646  ret = dev->netdev_ops->ndo_init(dev);
5647  if (ret) {
5648  if (ret > 0)
5649  ret = -EIO;
5650  goto out;
5651  }
5652  }
5653 
5654  ret = -EBUSY;
5655  if (!dev->ifindex)
5656  dev->ifindex = dev_new_index(net);
5657  else if (__dev_get_by_index(net, dev->ifindex))
5658  goto err_uninit;
5659 
5660  if (dev->iflink == -1)
5661  dev->iflink = dev->ifindex;
5662 
5663  /* Transfer changeable features to wanted_features and enable
5664  * software offloads (GSO and GRO).
5665  */
5666  dev->hw_features |= NETIF_F_SOFT_FEATURES;
5667  dev->features |= NETIF_F_SOFT_FEATURES;
5668  dev->wanted_features = dev->features & dev->hw_features;
5669 
5670  /* Turn on no cache copy if HW is doing checksum */
5671  if (!(dev->flags & IFF_LOOPBACK)) {
5672  dev->hw_features |= NETIF_F_NOCACHE_COPY;
5673  if (dev->features & NETIF_F_ALL_CSUM) {
5674  dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5675  dev->features |= NETIF_F_NOCACHE_COPY;
5676  }
5677  }
5678 
5679  /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5680  */
5681  dev->vlan_features |= NETIF_F_HIGHDMA;
5682 
5683  ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5684  ret = notifier_to_errno(ret);
5685  if (ret)
5686  goto err_uninit;
5687 
5688  ret = netdev_register_kobject(dev);
5689  if (ret)
5690  goto err_uninit;
5691  dev->reg_state = NETREG_REGISTERED;
5692 
5693  __netdev_update_features(dev);
5694 
5695  /*
5696  * Default initial state at registry is that the
5697  * device is present.
5698  */
5699 
5700  set_bit(__LINK_STATE_PRESENT, &dev->state);
5701 
5702  linkwatch_init_dev(dev);
5703 
5704  dev_init_scheduler(dev);
5705  dev_hold(dev);
5706  list_netdevice(dev);
5707  add_device_randomness(dev->dev_addr, dev->addr_len);
5708 
5709  /* Notify protocols, that a new device appeared. */
5710  ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5711  ret = notifier_to_errno(ret);
5712  if (ret) {
5713  rollback_registered(dev);
5714  dev->reg_state = NETREG_UNREGISTERED;
5715  }
5716  /*
5717  * Prevent userspace races by waiting until the network
5718  * device is fully setup before sending notifications.
5719  */
5720  if (!dev->rtnl_link_ops ||
5721  dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5722  rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5723 
5724 out:
5725  return ret;
5726 
5727 err_uninit:
5728  if (dev->netdev_ops->ndo_uninit)
5729  dev->netdev_ops->ndo_uninit(dev);
5730  goto out;
5731 }
5732 EXPORT_SYMBOL(register_netdevice);
5733 
5744 int init_dummy_netdev(struct net_device *dev)
5745 {
5746  /* Clear everything. Note we don't initialize spinlocks
5747  * as they aren't supposed to be taken by any of the
5748  * NAPI code and this dummy netdev is supposed to be
5749  * only ever used for NAPI polls
5750  */
5751  memset(dev, 0, sizeof(struct net_device));
5752 
5753  /* make sure we BUG if trying to hit standard
5754  * register/unregister code path
5755  */
5756  dev->reg_state = NETREG_DUMMY;
5757 
5758  /* NAPI wants this */
5759  INIT_LIST_HEAD(&dev->napi_list);
5760 
5761  /* a dummy interface is started by default */
5762  set_bit(__LINK_STATE_PRESENT, &dev->state);
5763  set_bit(__LINK_STATE_START, &dev->state);
5764 
5765  /* Note : We dont allocate pcpu_refcnt for dummy devices,
5766  * because users of this 'device' dont need to change
5767  * its refcount.
5768  */
5769 
5770  return 0;
5771 }
5773 
5774 
5788 int register_netdev(struct net_device *dev)
5789 {
5790  int err;
5791 
5792  rtnl_lock();
5793  err = register_netdevice(dev);
5794  rtnl_unlock();
5795  return err;
5796 }
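/*
 * Editor's illustrative sketch (not part of dev.c): the usual driver-side
 * pairing of allocation, register_netdev() and the teardown helpers defined
 * later in this file.  The private struct, the ops table and the
 * "drop everything" transmit handler are hypothetical.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct example_priv {
	int dummy;			/* placeholder private state */
};

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	dev_kfree_skb(skb);		/* hypothetical: discard every packet */
	return NETDEV_TX_OK;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_start_xmit	= example_xmit,
};

static int example_create(struct net_device **out)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &example_netdev_ops;
	err = register_netdev(dev);	/* takes and drops RTNL internally */
	if (err) {
		free_netdev(dev);
		return err;
	}
	*out = dev;
	return 0;
}

static void example_destroy(struct net_device *dev)
{
	unregister_netdev(dev);		/* refcount wait happens via netdev_run_todo() */
	free_netdev(dev);
}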
5798 
5799 int netdev_refcnt_read(const struct net_device *dev)
5800 {
5801  int i, refcnt = 0;
5802 
5803  for_each_possible_cpu(i)
5804  refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5805  return refcnt;
5806 }
5808 
5821 static void netdev_wait_allrefs(struct net_device *dev)
5822 {
5823  unsigned long rebroadcast_time, warning_time;
5824  int refcnt;
5825 
5826  linkwatch_forget_dev(dev);
5827 
5828  rebroadcast_time = warning_time = jiffies;
5829  refcnt = netdev_refcnt_read(dev);
5830 
5831  while (refcnt != 0) {
5832  if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5833  rtnl_lock();
5834 
5835  /* Rebroadcast unregister notification */
5836  call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5837 
5838  __rtnl_unlock();
5839  rcu_barrier();
5840  rtnl_lock();
5841 
5842  call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5843  if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5844  &dev->state)) {
5845  /* We must not have linkwatch events
5846  * pending on unregister. If this
5847  * happens, we simply run the queue
5848  * unscheduled, resulting in a noop
5849  * for this device.
5850  */
5851  linkwatch_run_queue();
5852  }
5853 
5854  __rtnl_unlock();
5855 
5856  rebroadcast_time = jiffies;
5857  }
5858 
5859  msleep(250);
5860 
5861  refcnt = netdev_refcnt_read(dev);
5862 
5863  if (time_after(jiffies, warning_time + 10 * HZ)) {
5864  pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5865  dev->name, refcnt);
5866  warning_time = jiffies;
5867  }
5868  }
5869 }
5870 
5871 /* The sequence is:
5872  *
5873  * rtnl_lock();
5874  * ...
5875  * register_netdevice(x1);
5876  * register_netdevice(x2);
5877  * ...
5878  * unregister_netdevice(y1);
5879  * unregister_netdevice(y2);
5880  * ...
5881  * rtnl_unlock();
5882  * free_netdev(y1);
5883  * free_netdev(y2);
5884  *
5885  * We are invoked by rtnl_unlock().
5886  * This allows us to deal with problems:
5887  * 1) We can delete sysfs objects which invoke hotplug
5888  * without deadlocking with linkwatch via keventd.
5889  * 2) Since we run with the RTNL semaphore not held, we can sleep
5890  * safely in order to wait for the netdev refcnt to drop to zero.
5891  *
5892  * We must not return until all unregister events added during
5893  * the interval the lock was held have been completed.
5894  */
5895 void netdev_run_todo(void)
5896 {
5897  struct list_head list;
5898 
5899  /* Snapshot list, allow later requests */
5900  list_replace_init(&net_todo_list, &list);
5901 
5902  __rtnl_unlock();
5903 
5904 
5905  /* Wait for rcu callbacks to finish before next phase */
5906  if (!list_empty(&list))
5907  rcu_barrier();
5908 
5909  while (!list_empty(&list)) {
5910  struct net_device *dev
5911  = list_first_entry(&list, struct net_device, todo_list);
5912  list_del(&dev->todo_list);
5913 
5914  rtnl_lock();
5915  call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5916  __rtnl_unlock();
5917 
5918  if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5919  pr_err("network todo '%s' but state %d\n",
5920  dev->name, dev->reg_state);
5921  dump_stack();
5922  continue;
5923  }
5924 
5925  dev->reg_state = NETREG_UNREGISTERED;
5926 
5927  on_each_cpu(flush_backlog, dev, 1);
5928 
5929  netdev_wait_allrefs(dev);
5930 
5931  /* paranoia */
5932  BUG_ON(netdev_refcnt_read(dev));
5933  WARN_ON(rcu_access_pointer(dev->ip_ptr));
5934  WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5935  WARN_ON(dev->dn_ptr);
5936 
5937  if (dev->destructor)
5938  dev->destructor(dev);
5939 
5940  /* Free network device */
5941  kobject_put(&dev->dev.kobj);
5942  }
5943 }
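/*
 * Editor's illustrative sketch (not part of dev.c): the batching pattern
 * described in the comment before netdev_run_todo().  Several devices are
 * unregistered under a single RTNL hold; netdev_run_todo() runs from
 * rtnl_unlock(), and only after that is it safe to free_netdev() them.
 * The two device pointers are assumed to come from the caller.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void example_batch_unregister(struct net_device *a, struct net_device *b)
{
	rtnl_lock();
	unregister_netdevice(a);	/* queued on net_todo_list */
	unregister_netdevice(b);
	rtnl_unlock();			/* invokes netdev_run_todo() */

	free_netdev(a);			/* safe only after the todo list has run */
	free_netdev(b);
}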
5944 
5945 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5946  * fields in the same order, with only the type differing.
5947  */
5948 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5949  const struct net_device_stats *netdev_stats)
5950 {
5951 #if BITS_PER_LONG == 64
5952  BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5953  memcpy(stats64, netdev_stats, sizeof(*stats64));
5954 #else
5955  size_t i, n = sizeof(*stats64) / sizeof(u64);
5956  const unsigned long *src = (const unsigned long *)netdev_stats;
5957  u64 *dst = (u64 *)stats64;
5958 
5959  BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5960  sizeof(*stats64) / sizeof(u64));
5961  for (i = 0; i < n; i++)
5962  dst[i] = src[i];
5963 #endif
5964 }
5965 EXPORT_SYMBOL(netdev_stats_to_stats64);
5966 
5977 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5978  struct rtnl_link_stats64 *storage)
5979 {
5980  const struct net_device_ops *ops = dev->netdev_ops;
5981 
5982  if (ops->ndo_get_stats64) {
5983  memset(storage, 0, sizeof(*storage));
5984  ops->ndo_get_stats64(dev, storage);
5985  } else if (ops->ndo_get_stats) {
5986  netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5987  } else {
5988  netdev_stats_to_stats64(storage, &dev->stats);
5989  }
5990  storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5991  return storage;
5992 }
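/*
 * Editor's illustrative sketch (not part of dev.c): a minimal driver
 * ndo_get_stats64() hook of the kind dev_get_stats() prefers.  The private
 * struct and its counters are hypothetical; real drivers usually aggregate
 * per-queue or per-cpu counters here.
 */
#include <linux/netdevice.h>

struct example_stats_priv {
	u64 rx_packets;
	u64 tx_packets;
};

static struct rtnl_link_stats64 *
example_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
{
	struct example_stats_priv *priv = netdev_priv(dev);

	/* storage was zeroed by dev_get_stats() before this call */
	storage->rx_packets = priv->rx_packets;
	storage->tx_packets = priv->tx_packets;
	return storage;
}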
5994 
5995 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5996 {
5997  struct netdev_queue *queue = dev_ingress_queue(dev);
5998 
5999 #ifdef CONFIG_NET_CLS_ACT
6000  if (queue)
6001  return queue;
6002  queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6003  if (!queue)
6004  return NULL;
6005  netdev_init_one_queue(dev, queue, NULL);
6006  queue->qdisc = &noop_qdisc;
6007  queue->qdisc_sleeping = &noop_qdisc;
6008  rcu_assign_pointer(dev->ingress_queue, queue);
6009 #endif
6010  return queue;
6011 }
6012 
6013 static const struct ethtool_ops default_ethtool_ops;
6014 
6027 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6028  void (*setup)(struct net_device *),
6029  unsigned int txqs, unsigned int rxqs)
6030 {
6031  struct net_device *dev;
6032  size_t alloc_size;
6033  struct net_device *p;
6034 
6035  BUG_ON(strlen(name) >= sizeof(dev->name));
6036 
6037  if (txqs < 1) {
6038  pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6039  return NULL;
6040  }
6041 
6042 #ifdef CONFIG_RPS
6043  if (rxqs < 1) {
6044  pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6045  return NULL;
6046  }
6047 #endif
6048 
6049  alloc_size = sizeof(struct net_device);
6050  if (sizeof_priv) {
6051  /* ensure 32-byte alignment of private area */
6052  alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6053  alloc_size += sizeof_priv;
6054  }
6055  /* ensure 32-byte alignment of whole construct */
6056  alloc_size += NETDEV_ALIGN - 1;
6057 
6058  p = kzalloc(alloc_size, GFP_KERNEL);
6059  if (!p) {
6060  pr_err("alloc_netdev: Unable to allocate device\n");
6061  return NULL;
6062  }
6063 
6064  dev = PTR_ALIGN(p, NETDEV_ALIGN);
6065  dev->padded = (char *)dev - (char *)p;
6066 
6067  dev->pcpu_refcnt = alloc_percpu(int);
6068  if (!dev->pcpu_refcnt)
6069  goto free_p;
6070 
6071  if (dev_addr_init(dev))
6072  goto free_pcpu;
6073 
6074  dev_mc_init(dev);
6075  dev_uc_init(dev);
6076 
6077  dev_net_set(dev, &init_net);
6078 
6079  dev->gso_max_size = GSO_MAX_SIZE;
6080  dev->gso_max_segs = GSO_MAX_SEGS;
6081 
6082  INIT_LIST_HEAD(&dev->napi_list);
6083  INIT_LIST_HEAD(&dev->unreg_list);
6084  INIT_LIST_HEAD(&dev->link_watch_list);
6085  dev->priv_flags = IFF_XMIT_DST_RELEASE;
6086  setup(dev);
6087 
6088  dev->num_tx_queues = txqs;
6089  dev->real_num_tx_queues = txqs;
6090  if (netif_alloc_netdev_queues(dev))
6091  goto free_all;
6092 
6093 #ifdef CONFIG_RPS
6094  dev->num_rx_queues = rxqs;
6095  dev->real_num_rx_queues = rxqs;
6096  if (netif_alloc_rx_queues(dev))
6097  goto free_all;
6098 #endif
6099 
6100  strcpy(dev->name, name);
6101  dev->group = INIT_NETDEV_GROUP;
6102  if (!dev->ethtool_ops)
6103  dev->ethtool_ops = &default_ethtool_ops;
6104  return dev;
6105 
6106 free_all:
6107  free_netdev(dev);
6108  return NULL;
6109 
6110 free_pcpu:
6111  free_percpu(dev->pcpu_refcnt);
6112  kfree(dev->_tx);
6113 #ifdef CONFIG_RPS
6114  kfree(dev->_rx);
6115 #endif
6116 
6117 free_p:
6118  kfree(p);
6119  return NULL;
6120 }
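/*
 * Editor's illustrative sketch (not part of dev.c): allocating a multiqueue
 * Ethernet device directly with alloc_netdev_mqs().  The queue counts, the
 * name template and the private struct are assumptions for the example.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct example_mq_priv {
	int dummy;
};

static struct net_device *example_alloc_mq(void)
{
	/* 8 TX queues, 8 RX queues; the final name is picked from "eth%d"
	 * by dev_get_valid_name() at register time.
	 */
	return alloc_netdev_mqs(sizeof(struct example_mq_priv), "eth%d",
				ether_setup, 8, 8);
}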
6122 
6131 void free_netdev(struct net_device *dev)
6132 {
6133  struct napi_struct *p, *n;
6134 
6135  release_net(dev_net(dev));
6136 
6137  kfree(dev->_tx);
6138 #ifdef CONFIG_RPS
6139  kfree(dev->_rx);
6140 #endif
6141 
6142  kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6143 
6144  /* Flush device addresses */
6145  dev_addr_flush(dev);
6146 
6147  list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6148  netif_napi_del(p);
6149 
6150  free_percpu(dev->pcpu_refcnt);
6151  dev->pcpu_refcnt = NULL;
6152 
6153  /* Compatibility with error handling in drivers */
6154  if (dev->reg_state == NETREG_UNINITIALIZED) {
6155  kfree((char *)dev - dev->padded);
6156  return;
6157  }
6158 
6159  BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6160  dev->reg_state = NETREG_RELEASED;
6161 
6162  /* will free via device release */
6163  put_device(&dev->dev);
6164 }
6166 
6173 void synchronize_net(void)
6174 {
6175  might_sleep();
6176  if (rtnl_is_locked())
6177  synchronize_rcu_expedited();
6178  else
6179  synchronize_rcu();
6180 }
6182 
6196 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6197 {
6198  ASSERT_RTNL();
6199 
6200  if (head) {
6201  list_move_tail(&dev->unreg_list, head);
6202  } else {
6203  rollback_registered(dev);
6204  /* Finish processing unregister after unlock */
6205  net_set_todo(dev);
6206  }
6207 }
6209 
6214 void unregister_netdevice_many(struct list_head *head)
6215 {
6216  struct net_device *dev;
6217 
6218  if (!list_empty(head)) {
6219  rollback_registered_many(head);
6220  list_for_each_entry(dev, head, unreg_list)
6221  net_set_todo(dev);
6222  }
6223 }
6225 
6237 void unregister_netdev(struct net_device *dev)
6238 {
6239  rtnl_lock();
6240  unregister_netdevice(dev);
6241  rtnl_unlock();
6242 }
6244 
6259 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6260 {
6261  int err;
6262 
6263  ASSERT_RTNL();
6264 
6265  /* Don't allow namespace local devices to be moved. */
6266  err = -EINVAL;
6267  if (dev->features & NETIF_F_NETNS_LOCAL)
6268  goto out;
6269 
6270  /* Ensure the device has been registered */
6271  err = -EINVAL;
6272  if (dev->reg_state != NETREG_REGISTERED)
6273  goto out;
6274 
6275  /* Get out if there is nothing to do */
6276  err = 0;
6277  if (net_eq(dev_net(dev), net))
6278  goto out;
6279 
6280  /* Pick the destination device name, and ensure
6281  * we can use it in the destination network namespace.
6282  */
6283  err = -EEXIST;
6284  if (__dev_get_by_name(net, dev->name)) {
6285  /* We get here if we can't use the current device name */
6286  if (!pat)
6287  goto out;
6288  if (dev_get_valid_name(net, dev, pat) < 0)
6289  goto out;
6290  }
6291 
6292  /*
6293  * And now a mini version of register_netdevice and unregister_netdevice.
6294  */
6295 
6296  /* If device is running close it first. */
6297  dev_close(dev);
6298 
6299  /* And unlink it from device chain */
6300  err = -ENODEV;
6301  unlist_netdevice(dev);
6302 
6303  synchronize_net();
6304 
6305  /* Shutdown queueing discipline. */
6306  dev_shutdown(dev);
6307 
6308  /* Notify protocols, that we are about to destroy
6309  this device. They should clean all the things.
6310 
6311  Note that dev->reg_state stays at NETREG_REGISTERED.
6312  This is wanted because this way 8021q and macvlan know
6313  the device is just moving and can keep their slaves up.
6314  */
6315  call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6316  rcu_barrier();
6317  call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6318  rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6319 
6320  /*
6321  * Flush the unicast and multicast chains
6322  */
6323  dev_uc_flush(dev);
6324  dev_mc_flush(dev);
6325 
6326  /* Actually switch the network namespace */
6327  dev_net_set(dev, net);
6328 
6329  /* If there is an ifindex conflict assign a new one */
6330  if (__dev_get_by_index(net, dev->ifindex)) {
6331  int iflink = (dev->iflink == dev->ifindex);
6332  dev->ifindex = dev_new_index(net);
6333  if (iflink)
6334  dev->iflink = dev->ifindex;
6335  }
6336 
6337  /* Fixup kobjects */
6338  err = device_rename(&dev->dev, dev->name);
6339  WARN_ON(err);
6340 
6341  /* Add the device back in the hashes */
6342  list_netdevice(dev);
6343 
6344  /* Notify protocols, that a new device appeared. */
6345  call_netdevice_notifiers(NETDEV_REGISTER, dev);
6346 
6347  /*
6348  * Prevent userspace races by waiting until the network
6349  * device is fully setup before sending notifications.
6350  */
6351  rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6352 
6353  synchronize_net();
6354  err = 0;
6355 out:
6356  return err;
6357 }
6359 
6360 static int dev_cpu_callback(struct notifier_block *nfb,
6361  unsigned long action,
6362  void *ocpu)
6363 {
6364  struct sk_buff **list_skb;
6365  struct sk_buff *skb;
6366  unsigned int cpu, oldcpu = (unsigned long)ocpu;
6367  struct softnet_data *sd, *oldsd;
6368 
6369  if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6370  return NOTIFY_OK;
6371 
6372  local_irq_disable();
6373  cpu = smp_processor_id();
6374  sd = &per_cpu(softnet_data, cpu);
6375  oldsd = &per_cpu(softnet_data, oldcpu);
6376 
6377  /* Find end of our completion_queue. */
6378  list_skb = &sd->completion_queue;
6379  while (*list_skb)
6380  list_skb = &(*list_skb)->next;
6381  /* Append completion queue from offline CPU. */
6382  *list_skb = oldsd->completion_queue;
6383  oldsd->completion_queue = NULL;
6384 
6385  /* Append output queue from offline CPU. */
6386  if (oldsd->output_queue) {
6387  *sd->output_queue_tailp = oldsd->output_queue;
6388  sd->output_queue_tailp = oldsd->output_queue_tailp;
6389  oldsd->output_queue = NULL;
6390  oldsd->output_queue_tailp = &oldsd->output_queue;
6391  }
6392  /* Append NAPI poll list from offline CPU. */
6393  if (!list_empty(&oldsd->poll_list)) {
6394  list_splice_init(&oldsd->poll_list, &sd->poll_list);
6395  raise_softirq_irqoff(NET_RX_SOFTIRQ);
6396  }
6397 
6398  raise_softirq_irqoff(NET_TX_SOFTIRQ);
6399  local_irq_enable();
6400 
6401  /* Process offline CPU's input_pkt_queue */
6402  while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6403  netif_rx(skb);
6404  input_queue_head_incr(oldsd);
6405  }
6406  while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6407  netif_rx(skb);
6408  input_queue_head_incr(oldsd);
6409  }
6410 
6411  return NOTIFY_OK;
6412 }
6413 
6414 
6425 netdev_features_t netdev_increment_features(netdev_features_t all,
6426  netdev_features_t one, netdev_features_t mask)
6427 {
6428  if (mask & NETIF_F_GEN_CSUM)
6429  mask |= NETIF_F_ALL_CSUM;
6430  mask |= NETIF_F_VLAN_CHALLENGED;
6431 
6432  all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6433  all &= one | ~NETIF_F_ALL_FOR_ALL;
6434 
6435  /* If one device supports hw checksumming, set for all. */
6436  if (all & NETIF_F_GEN_CSUM)
6437  all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6438 
6439  return all;
6440 }
6442 
6443 static struct hlist_head *netdev_create_hash(void)
6444 {
6445  int i;
6446  struct hlist_head *hash;
6447 
6448  hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6449  if (hash != NULL)
6450  for (i = 0; i < NETDEV_HASHENTRIES; i++)
6451  INIT_HLIST_HEAD(&hash[i]);
6452 
6453  return hash;
6454 }
6455 
6456 /* Initialize per network namespace state */
6457 static int __net_init netdev_init(struct net *net)
6458 {
6459  if (net != &init_net)
6460  INIT_LIST_HEAD(&net->dev_base_head);
6461 
6462  net->dev_name_head = netdev_create_hash();
6463  if (net->dev_name_head == NULL)
6464  goto err_name;
6465 
6466  net->dev_index_head = netdev_create_hash();
6467  if (net->dev_index_head == NULL)
6468  goto err_idx;
6469 
6470  return 0;
6471 
6472 err_idx:
6473  kfree(net->dev_name_head);
6474 err_name:
6475  return -ENOMEM;
6476 }
6477 
6484 const char *netdev_drivername(const struct net_device *dev)
6485 {
6486  const struct device_driver *driver;
6487  const struct device *parent;
6488  const char *empty = "";
6489 
6490  parent = dev->dev.parent;
6491  if (!parent)
6492  return empty;
6493 
6494  driver = parent->driver;
6495  if (driver && driver->name)
6496  return driver->name;
6497  return empty;
6498 }
6499 
6500 static int __netdev_printk(const char *level, const struct net_device *dev,
6501  struct va_format *vaf)
6502 {
6503  int r;
6504 
6505  if (dev && dev->dev.parent) {
6506  r = dev_printk_emit(level[1] - '0',
6507  dev->dev.parent,
6508  "%s %s %s: %pV",
6509  dev_driver_string(dev->dev.parent),
6510  dev_name(dev->dev.parent),
6511  netdev_name(dev), vaf);
6512  } else if (dev) {
6513  r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6514  } else {
6515  r = printk("%s(NULL net_device): %pV", level, vaf);
6516  }
6517 
6518  return r;
6519 }
6520 
6521 int netdev_printk(const char *level, const struct net_device *dev,
6522  const char *format, ...)
6523 {
6524  struct va_format vaf;
6525  va_list args;
6526  int r;
6527 
6528  va_start(args, format);
6529 
6530  vaf.fmt = format;
6531  vaf.va = &args;
6532 
6533  r = __netdev_printk(level, dev, &vaf);
6534 
6535  va_end(args);
6536 
6537  return r;
6538 }
6540 
6541 #define define_netdev_printk_level(func, level) \
6542 int func(const struct net_device *dev, const char *fmt, ...) \
6543 { \
6544  int r; \
6545  struct va_format vaf; \
6546  va_list args; \
6547  \
6548  va_start(args, fmt); \
6549  \
6550  vaf.fmt = fmt; \
6551  vaf.va = &args; \
6552  \
6553  r = __netdev_printk(level, dev, &vaf); \
6554  \
6555  va_end(args); \
6556  \
6557  return r; \
6558 } \
6559 EXPORT_SYMBOL(func);
6560 
6561 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6562 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6563 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6564 define_netdev_printk_level(netdev_err, KERN_ERR);
6565 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6566 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6567 define_netdev_printk_level(netdev_info, KERN_INFO);
6568 
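/*
 * Editor's illustrative sketch (not part of dev.c): the per-level helpers
 * generated above are used like printk() but automatically prefix the
 * driver, bus and interface names.  The helper name and message text are
 * only an example.
 */
#include <linux/netdevice.h>

static void example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}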
6569 static void __net_exit netdev_exit(struct net *net)
6570 {
6571  kfree(net->dev_name_head);
6572  kfree(net->dev_index_head);
6573 }
6574 
6575 static struct pernet_operations __net_initdata netdev_net_ops = {
6576  .init = netdev_init,
6577  .exit = netdev_exit,
6578 };
6579 
6580 static void __net_exit default_device_exit(struct net *net)
6581 {
6582  struct net_device *dev, *aux;
6583  /*
6584  * Push all migratable network devices back to the
6585  * initial network namespace
6586  */
6587  rtnl_lock();
6588  for_each_netdev_safe(net, dev, aux) {
6589  int err;
6590  char fb_name[IFNAMSIZ];
6591 
6592  /* Ignore unmoveable devices (i.e. loopback) */
6593  if (dev->features & NETIF_F_NETNS_LOCAL)
6594  continue;
6595 
6596  /* Leave virtual devices for the generic cleanup */
6597  if (dev->rtnl_link_ops)
6598  continue;
6599 
6600  /* Push remaining network devices to init_net */
6601  snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6602  err = dev_change_net_namespace(dev, &init_net, fb_name);
6603  if (err) {
6604  pr_emerg("%s: failed to move %s to init_net: %d\n",
6605  __func__, dev->name, err);
6606  BUG();
6607  }
6608  }
6609  rtnl_unlock();
6610 }
6611 
6612 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6613 {
6614  /* At exit all network devices must be removed from a network
6615  * namespace. Do this in the reverse order of registration.
6616  * Do this across as many network namespaces as possible to
6617  * improve batching efficiency.
6618  */
6619  struct net_device *dev;
6620  struct net *net;
6621  LIST_HEAD(dev_kill_list);
6622 
6623  rtnl_lock();
6624  list_for_each_entry(net, net_list, exit_list) {
6625  for_each_netdev_reverse(net, dev) {
6626  if (dev->rtnl_link_ops)
6627  dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6628  else
6629  unregister_netdevice_queue(dev, &dev_kill_list);
6630  }
6631  }
6632  unregister_netdevice_many(&dev_kill_list);
6633  list_del(&dev_kill_list);
6634  rtnl_unlock();
6635 }
6636 
6637 static struct pernet_operations __net_initdata default_device_ops = {
6638  .exit = default_device_exit,
6639  .exit_batch = default_device_exit_batch,
6640 };
6641 
6642 /*
6643  * Initialize the DEV module. At boot time this walks the device list and
6644  * unhooks any devices that fail to initialise (normally hardware not
6645  * present) and leaves us with a valid list of present and active devices.
6646  *
6647  */
6648 
6649 /*
6650  * This is called single threaded during boot, so no need
6651  * to take the rtnl semaphore.
6652  */
6653 static int __init net_dev_init(void)
6654 {
6655  int i, rc = -ENOMEM;
6656 
6657  BUG_ON(!dev_boot_phase);
6658 
6659  if (dev_proc_init())
6660  goto out;
6661 
6662  if (netdev_kobject_init())
6663  goto out;
6664 
6665  INIT_LIST_HEAD(&ptype_all);
6666  for (i = 0; i < PTYPE_HASH_SIZE; i++)
6667  INIT_LIST_HEAD(&ptype_base[i]);
6668 
6669  if (register_pernet_subsys(&netdev_net_ops))
6670  goto out;
6671 
6672  /*
6673  * Initialise the packet receive queues.
6674  */
6675 
6676  for_each_possible_cpu(i) {
6677  struct softnet_data *sd = &per_cpu(softnet_data, i);
6678 
6679  memset(sd, 0, sizeof(*sd));
6680  skb_queue_head_init(&sd->input_pkt_queue);
6681  skb_queue_head_init(&sd->process_queue);
6682  sd->completion_queue = NULL;
6683  INIT_LIST_HEAD(&sd->poll_list);
6684  sd->output_queue = NULL;
6685  sd->output_queue_tailp = &sd->output_queue;
6686 #ifdef CONFIG_RPS
6687  sd->csd.func = rps_trigger_softirq;
6688  sd->csd.info = sd;
6689  sd->csd.flags = 0;
6690  sd->cpu = i;
6691 #endif
6692 
6693  sd->backlog.poll = process_backlog;
6694  sd->backlog.weight = weight_p;
6695  sd->backlog.gro_list = NULL;
6696  sd->backlog.gro_count = 0;
6697  }
6698 
6699  dev_boot_phase = 0;
6700 
6701  /* The loopback device is special: if any other network device
6702  * is present in a network namespace, the loopback device must
6703  * be present too. Since we now dynamically allocate and free the
6704  * loopback device, ensure this invariant is maintained by
6705  * keeping the loopback device as the first device on the
6706  * list of network devices, so that the loopback device
6707  * is the first device that appears and the last network device
6708  * that disappears.
6709  */
6710  if (register_pernet_device(&loopback_net_ops))
6711  goto out;
6712 
6713  if (register_pernet_device(&default_device_ops))
6714  goto out;
6715 
6716  open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6717  open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6718 
6719  hotcpu_notifier(dev_cpu_callback, 0);
6720  dst_init();
6721  dev_mcast_init();
6722  rc = 0;
6723 out:
6724  return rc;
6725 }
6726 
6727 subsys_initcall(net_dev_init);
6728 
6729 static int __init initialize_hashrnd(void)
6730 {
6731  get_random_bytes(&hashrnd, sizeof(hashrnd));
6732  return 0;
6733 }
6734 
6735 late_initcall_sync(initialize_hashrnd);
6736