Linux Kernel 3.7.1
sch_api.c
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <[email protected]>
 *
 * Fixes:
 *
 * Rani Assaf <[email protected]> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <[email protected]> :990222: kmod support
 * Jamal Hadi Salim <[email protected]>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box which is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something), in an order and at times determined by
   the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to make some sanity checks
   and do the part of the work common to all qdiscs, and to provide
   rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing the packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of
   the qdisc.

   ---change

   changes qdisc parameters.
 */
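/*
 * Illustrative sketch (not part of the original file): a minimal
 * work-conserving "queue" qdisc that follows the enqueue/dequeue
 * contract described above. All "example_*" names are hypothetical;
 * the helpers come from <net/sch_generic.h>, and the real pfifo
 * lives in sch_fifo.c.
 */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
		return qdisc_enqueue_tail(skb, sch);	/* NET_XMIT_SUCCESS (0) */

	return qdisc_reshape_fail(skb, sch);		/* drops, returns NET_XMIT_* */
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= qdisc_dequeue_head,	/* may return NULL: "nothing to send now" */
	.peek		= qdisc_peek_head,	/* like dequeue, but non-destructive */
	.owner		= THIS_MODULE,
};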

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
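/*
 * Illustrative usage (hypothetical module, not in the original file):
 * a qdisc module registers its ops on load and unregisters on unload,
 * here reusing the example_qdisc_ops sketched above.
 */
static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");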

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
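/*
 * Illustrative sketch (not in the original file): how a shaper consumes
 * a rate table. qdisc_l2t() from <net/pkt_sched.h> maps a packet length
 * to its transmission time in scheduler ticks by indexing the 256-slot
 * table at (len >> cell_log); TBF and CBQ use it this way. The function
 * name here is hypothetical.
 */
static psched_tdiff_t example_xmit_time(struct qdisc_rate_table *rtab,
					struct sk_buff *skb)
{
	/* qdisc_pkt_len() already includes any size-table adjustment. */
	return qdisc_l2t(rtab, qdisc_pkt_len(skb));
}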

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
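/*
 * Worked example (illustrative, not in the original file): with
 * overhead = 24, cell_align = -1, cell_log = 6, size_log = 0 and
 * tsize = 512, a 1500 byte packet gives pkt_len = 1524 and
 * slot = (1524 - 1) >> 6 = 23, so the reported length becomes
 * stab->data[23]. Out-of-range slots extrapolate from the last
 * table entry, as implemented above.
 */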

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

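/*
 * Illustrative sketch (not in the original file): the dequeue-side
 * pattern a non-work-conserving qdisc typically builds on the
 * qdisc_watchdog_* API above. The struct and function names are
 * hypothetical; TBF's dequeue is the canonical real user.
 */
struct example_shaper_data {
	struct qdisc_watchdog	watchdog;
	psched_time_t		next_send;
};

static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	struct example_shaper_data *q = qdisc_priv(sch);
	psched_time_t now = psched_get_time();

	if (now < q->next_send) {
		/* Too early: arm the hrtimer and report "nothing now". */
		qdisc_watchdog_schedule(&q->watchdog, q->next_send);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}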
static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize = size;
	clhash->hashmask = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

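/*
 * Illustrative sketch (not in the original file): how a classful qdisc
 * typically embeds and queries the class hash. All names here are
 * hypothetical; HTB and HFSC follow the same pattern with
 * qdisc_class_find() from <net/sch_generic.h>.
 */
struct example_classful_data {
	struct Qdisc_class_hash	clhash;
};

struct example_class {
	struct Qdisc_class_common common;	/* classid + hash linkage */
	/* ... per-class scheduling state ... */
};

static struct example_class *example_class_find(struct Qdisc *sch, u32 classid)
{
	struct example_classful_data *q = qdisc_priv(sch);
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct example_class, common);
}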
/* Allocate a unique handle from the space managed by the kernel.
 * The possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

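/*
 * Illustrative usage (not in the original file): after dropping packets
 * internally (e.g. when a limit is lowered), a qdisc propagates the qlen
 * change up the tree so that ancestor qdiscs stay consistent. The helper
 * name is hypothetical and the bookkeeping is simplified.
 */
static void example_trim_queue(struct Qdisc *sch, unsigned int limit)
{
	unsigned int dropped = 0;
	struct sk_buff *skb;

	while (sch->q.qlen > limit && (skb = qdisc_dequeue_head(sch)) != NULL) {
		kfree_skb(skb);
		dropped++;
	}
	qdisc_tree_decrease_qlen(sch, dropped);
}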
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again with qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and have a choice:
				 * either to change it or to create/graft a
				 * new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if both CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, the requestor meant
				 * that the qdisc tcm_handle is not expected
				 * to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is sort of a hole in the API; we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft if the
				 * user gave a KIND which does not match the
				 * existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
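	/*
	 * Example (illustrative, not in the original file): "tc class add
	 * dev eth0 parent 1:1 classid 1:10 ..." arrives here with
	 * parent == 1:1 and handle == 1:10, so both majors agree and qid
	 * resolves to the qdisc handle 1:0 in step 1 below.
	 */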

	/* Step 1. Determine qdisc handle X:0 */

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		 * both parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
 * to this qdisc, (optionally) tests for protocol and asks the
 * specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err;

	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *otp = tp;
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

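/*
 * Illustrative sketch (not in the original file): how a scheduler's
 * enqueue path typically drives the classifier chain through
 * tc_classify(). The function name is hypothetical; compare with the
 * real users in sch_prio.c or sch_htb.c, which also inspect the
 * TC_ACT_* result codes under CONFIG_NET_CLS_ACT.
 */
static unsigned long example_classify(struct sk_buff *skb,
				      struct tcf_proto *fl)
{
	struct tcf_result res;

	if (tc_classify(skb, fl, &res) >= 0)
		return res.class;	/* opaque class handle for the qdisc */
	return 0;			/* no match: fall back to a default class */
}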
void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / (u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
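/*
 * Note (illustrative, not in the original file): the four fields of
 * /proc/net/psched printed above are, in order, nanoseconds per
 * microsecond, nanoseconds per scheduler tick, 1000000, and the clock
 * resolution in Hz; userspace tc reads them to convert between its
 * time units and kernel ticks.
 */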

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner	 = THIS_MODULE,
	.open	 = psched_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);

	return 0;
}

subsys_initcall(pktsched_init);