Linux Kernel 3.7.1
sch_cbq.c
1 /*
2  * net/sched/sch_cbq.c Class-Based Queueing discipline.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  *
9  * Authors: Alexey Kuznetsov, <[email protected]>
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 
23 
24 /* Class-Based Queueing (CBQ) algorithm.
25  =======================================
26 
27  Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28  Management Models for Packet Networks",
29  IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30 
31  [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32 
33  [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34  Parameters", 1996
35 
36  [4] Sally Floyd and Michael Speer, "Experimental Results
37  for Class-Based Queueing", 1998, not published.
38 
39  -----------------------------------------------------------------------
40 
41  Algorithm skeleton was taken from NS simulator cbq.cc.
42  If someone wants to check this code against the LBL version,
43  he should take into account that ONLY the skeleton was borrowed,
44  the implementation is different. Particularly:
45 
46  --- The WRR algorithm is different. Our version looks more
47  reasonable (I hope) and works when quanta are allowed to be
48  less than MTU, which is always the case when real-time classes
49  have small rates. Note that the statement of [3] is
50  incomplete; delay may actually be estimated even if the class
51  per-round allotment is less than MTU. Namely, if the per-round
52  allotment is W*r_i, and r_1+...+r_k = r < 1
53 
54  delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55 
56  In the worst case we have IntServ estimate with D = W*r+k*MTU
57  and C = MTU*r. The proof (if correct at all) is trivial.
58 
59 
60  --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61  interpret some places, which look like wrong translations
62  from NS. Anyone is advised to find these differences
63  and explain to me, why I am wrong 8).
64 
65  --- Linux has no EOI event, so we cannot estimate true class
66  idle time. The workaround is to consider the next dequeue event
67  as a sign that the previous packet is finished. This is wrong because of
68  internal device queueing, but on a permanently loaded link it is true.
69  Moreover, combined with clock integrator, this scheme looks
70  very close to an ideal solution. */
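
/* A worked instance of the bound above, with purely illustrative numbers
 * (example values, not taken from this file or the papers cited above):
 * k = 4 classes, each with r_i = 0.2 (so r = 0.8), MTU = 1500 bytes, and W
 * chosen so that the per-round allotment W*r_i = 500 bytes (W = 2500), on a
 * link of B = 12.5e6 bytes/sec (100 Mbit/s). Then
 *
 *	[MTU/(W*r_i)] = ceil(1500/500) = 3
 *	delay_i <= (3*W*r + W*r + k*MTU)/B
 *	         = (3*2000 + 2000 + 4*1500)/12.5e6 sec
 *	         = 14000/12.5e6 sec ~= 1.12 ms
 */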
71 
72 struct cbq_sched_data;
73 
74 
75 struct cbq_class {
76  struct Qdisc_class_common common;
77  struct cbq_class *next_alive; /* next class with backlog in this priority band */
78 
79 /* Parameters */
80  unsigned char priority; /* class priority */
81  unsigned char priority2; /* priority to be used after overlimit */
82  unsigned char ewma_log; /* time constant for idle time calculation */
83  unsigned char ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85  unsigned char police;
86 #endif
87 
88  u32 defmap;
89 
90  /* Link-sharing scheduler parameters */
91  long maxidle; /* Class parameters: see below. */
92  long offtime;
93  long minidle;
94  u32 avpkt;
95  struct qdisc_rate_table *R_tab;
96 
97  /* Overlimit strategy parameters */
98  void (*overlimit)(struct cbq_class *cl);
99  psched_tdiff_t penalty;
100 
101  /* General scheduler (WRR) parameters */
102  long allot;
103  long quantum; /* Allotment per WRR round */
104  long weight; /* Relative allotment: see below */
105 
106  struct Qdisc *qdisc; /* Ptr to CBQ discipline */
107  struct cbq_class *split; /* Ptr to split node */
108  struct cbq_class *share; /* Ptr to LS parent in the class tree */
109  struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
110  struct cbq_class *borrow; /* NULL if class is bandwidth limited;
111  parent otherwise */
112  struct cbq_class *sibling; /* Sibling chain */
113  struct cbq_class *children; /* Pointer to children chain */
114 
115  struct Qdisc *q; /* Elementary queueing discipline */
116 
117 
118 /* Variables */
119  unsigned char cpriority; /* Effective priority */
120  unsigned char delayed;
121  unsigned char level; /* level of the class in hierarchy:
122  0 for leaf classes, and maximal
123  level of children + 1 for nodes.
124  */
125 
126  psched_time_t last; /* Last end of service */
127  psched_time_t undertime;
128  long avgidle;
129  long deficit; /* Saved deficit for WRR */
130  psched_time_t penalized;
131  struct gnet_stats_basic_packed bstats;
132  struct gnet_stats_queue qstats;
133  struct gnet_stats_rate_est rate_est;
134  struct tc_cbq_xstats xstats;
135 
136  struct tcf_proto *filter_list;
137 
138  int refcnt;
139  int filters;
140 
141  struct cbq_class *defaults[TC_PRIO_MAX + 1];
142 };
143 
144 struct cbq_sched_data {
145  struct Qdisc_class_hash clhash; /* Hash table of all classes */
146  int nclasses[TC_CBQ_MAXPRIO + 1];
147  unsigned int quanta[TC_CBQ_MAXPRIO + 1];
148 
149  struct cbq_class link;
150 
151  unsigned int activemask;
152  struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
153  with backlog */
154 
155 #ifdef CONFIG_NET_CLS_ACT
156  struct cbq_class *rx_class;
157 #endif
158  struct cbq_class *tx_class;
159  struct cbq_class *tx_borrowed;
160  int tx_len;
161  psched_time_t now; /* Cached timestamp */
162  psched_time_t now_rt; /* Cached real time */
163  unsigned int pmask;
164 
165  struct hrtimer delay_timer;
166  struct qdisc_watchdog watchdog; /* Watchdog timer,
167  started when CBQ has
168  backlog, but cannot
169  transmit just now */
170  psched_tdiff_t wd_expires;
171  int toplevel;
172  u32 hgenerator;
173 };
174 
175 
176 #define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
177 
178 static inline struct cbq_class *
179 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
180 {
181  struct Qdisc_class_common *clc;
182 
183  clc = qdisc_class_find(&q->clhash, classid);
184  if (clc == NULL)
185  return NULL;
186  return container_of(clc, struct cbq_class, common);
187 }
188 
189 #ifdef CONFIG_NET_CLS_ACT
190 
191 static struct cbq_class *
192 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
193 {
194  struct cbq_class *cl;
195 
196  for (cl = this->tparent; cl; cl = cl->tparent) {
197  struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
198 
199  if (new != NULL && new != this)
200  return new;
201  }
202  return NULL;
203 }
204 
205 #endif
206 
207 /* Classify packet. The procedure is pretty complicated, but
208  * it allows us to combine link sharing and priority scheduling
209  * transparently.
210  *
211  * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
212  * so that it resolves to split nodes. Then packets are classified
213  * by logical priority, or a more specific classifier may be attached
214  * to the split node.
215  */
216 
217 static struct cbq_class *
218 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
219 {
220  struct cbq_sched_data *q = qdisc_priv(sch);
221  struct cbq_class *head = &q->link;
222  struct cbq_class **defmap;
223  struct cbq_class *cl = NULL;
224  u32 prio = skb->priority;
225  struct tcf_result res;
226 
227  /*
228  * Step 1. If skb->priority points to one of our classes, use it.
229  */
230  if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
231  (cl = cbq_class_lookup(q, prio)) != NULL)
232  return cl;
233 
234  *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
235  for (;;) {
236  int result = 0;
237  defmap = head->defaults;
238 
239  /*
240  * Step 2+n. Apply classifier.
241  */
242  if (!head->filter_list ||
243  (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
244  goto fallback;
245 
246  cl = (void *)res.class;
247  if (!cl) {
248  if (TC_H_MAJ(res.classid))
249  cl = cbq_class_lookup(q, res.classid);
250  else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
251  cl = defmap[TC_PRIO_BESTEFFORT];
252 
253  if (cl == NULL)
254  goto fallback;
255  }
256  if (cl->level >= head->level)
257  goto fallback;
258 #ifdef CONFIG_NET_CLS_ACT
259  switch (result) {
260  case TC_ACT_QUEUED:
261  case TC_ACT_STOLEN:
262  *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
263  case TC_ACT_SHOT:
264  return NULL;
265  case TC_ACT_RECLASSIFY:
266  return cbq_reclassify(skb, cl);
267  }
268 #endif
269  if (cl->level == 0)
270  return cl;
271 
272  /*
273  * Step 3+n. If classifier selected a link sharing class,
274  * apply agency-specific classifier.
275  * Repeat this procedure until we hit a leaf node.
276  */
277  head = cl;
278  }
279 
280 fallback:
281  cl = head;
282 
283  /*
284  * Step 4. No success...
285  */
286  if (TC_H_MAJ(prio) == 0 &&
287  !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
288  !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
289  return head;
290 
291  return cl;
292 }
293 
294 /*
295  * A packet has just been enqueued on an empty class.
296  * cbq_activate_class adds it to the tail of the active class list
297  * of its priority band.
298  */
299 
300 static inline void cbq_activate_class(struct cbq_class *cl)
301 {
302  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
303  int prio = cl->cpriority;
304  struct cbq_class *cl_tail;
305 
306  cl_tail = q->active[prio];
307  q->active[prio] = cl;
308 
309  if (cl_tail != NULL) {
310  cl->next_alive = cl_tail->next_alive;
311  cl_tail->next_alive = cl;
312  } else {
313  cl->next_alive = cl;
314  q->activemask |= (1<<prio);
315  }
316 }
317 
318 /*
319  * Unlink class from active chain.
320  * Note that this same procedure is done directly in cbq_dequeue*
321  * during round-robin procedure.
322  */
323 
324 static void cbq_deactivate_class(struct cbq_class *this)
325 {
326  struct cbq_sched_data *q = qdisc_priv(this->qdisc);
327  int prio = this->cpriority;
328  struct cbq_class *cl;
329  struct cbq_class *cl_prev = q->active[prio];
330 
331  do {
332  cl = cl_prev->next_alive;
333  if (cl == this) {
334  cl_prev->next_alive = cl->next_alive;
335  cl->next_alive = NULL;
336 
337  if (cl == q->active[prio]) {
338  q->active[prio] = cl_prev;
339  if (cl == q->active[prio]) {
340  q->active[prio] = NULL;
341  q->activemask &= ~(1<<prio);
342  return;
343  }
344  }
345  return;
346  }
347  } while ((cl_prev = cl) != q->active[prio]);
348 }
349 
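/* Lower the toplevel cutoff on enqueue: if the enqueuing class lies below the
 * current toplevel cutoff and its qdisc is not throttled, walk up its borrow
 * chain and set q->toplevel to the level of the first class found underlimit
 * (undertime < now), re-enabling borrowing at that level.
 */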
350 static void
351 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
352 {
353  int toplevel = q->toplevel;
354 
355  if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
356  psched_time_t now;
357  psched_tdiff_t incr;
358 
359  now = psched_get_time();
360  incr = now - q->now_rt;
361  now = q->now + incr;
362 
363  do {
364  if (cl->undertime < now) {
365  q->toplevel = cl->level;
366  return;
367  }
368  } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
369  }
370 }
371 
372 static int
373 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
374 {
375  struct cbq_sched_data *q = qdisc_priv(sch);
376  int uninitialized_var(ret);
377  struct cbq_class *cl = cbq_classify(skb, sch, &ret);
378 
379 #ifdef CONFIG_NET_CLS_ACT
380  q->rx_class = cl;
381 #endif
382  if (cl == NULL) {
383  if (ret & __NET_XMIT_BYPASS)
384  sch->qstats.drops++;
385  kfree_skb(skb);
386  return ret;
387  }
388 
389 #ifdef CONFIG_NET_CLS_ACT
390  cl->q->__parent = sch;
391 #endif
392  ret = qdisc_enqueue(skb, cl->q);
393  if (ret == NET_XMIT_SUCCESS) {
394  sch->q.qlen++;
395  cbq_mark_toplevel(q, cl);
396  if (!cl->next_alive)
397  cbq_activate_class(cl);
398  return ret;
399  }
400 
401  if (net_xmit_drop_count(ret)) {
402  sch->qstats.drops++;
403  cbq_mark_toplevel(q, cl);
404  cl->qstats.drops++;
405  }
406  return ret;
407 }
408 
409 /* Overlimit actions */
410 
411 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
412 
413 static void cbq_ovl_classic(struct cbq_class *cl)
414 {
415  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
416  psched_tdiff_t delay = cl->undertime - q->now;
417 
418  if (!cl->delayed) {
419  delay += cl->offtime;
420 
421  /*
422  * Class goes to sleep, so that it will have no
423  * chance to work off its avgidle. Let's forgive it 8)
424  *
425  * BTW cbq-2.0 has a bug in this
426  * place; apparently they forgot to shift it by cl->ewma_log.
427  */
428  if (cl->avgidle < 0)
429  delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
430  if (cl->avgidle < cl->minidle)
431  cl->avgidle = cl->minidle;
432  if (delay <= 0)
433  delay = 1;
434  cl->undertime = q->now + delay;
435 
436  cl->xstats.overactions++;
437  cl->delayed = 1;
438  }
439  if (q->wd_expires == 0 || q->wd_expires > delay)
440  q->wd_expires = delay;
441 
442  /* Dirty work! We must schedule wakeups based on
443  * real available rate, rather than leaf rate,
444  * which may be tiny (even zero).
445  */
446  if (q->toplevel == TC_CBQ_MAXLEVEL) {
447  struct cbq_class *b;
448  psched_tdiff_t base_delay = q->wd_expires;
449 
450  for (b = cl->borrow; b; b = b->borrow) {
451  delay = b->undertime - q->now;
452  if (delay < base_delay) {
453  if (delay <= 0)
454  delay = 1;
455  base_delay = delay;
456  }
457  }
458 
459  q->wd_expires = base_delay;
460  }
461 }
462 
463 /* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
464  * they go overlimit
465  */
466 
467 static void cbq_ovl_rclassic(struct cbq_class *cl)
468 {
469  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
470  struct cbq_class *this = cl;
471 
472  do {
473  if (cl->level > q->toplevel) {
474  cl = NULL;
475  break;
476  }
477  } while ((cl = cl->borrow) != NULL);
478 
479  if (cl == NULL)
480  cl = this;
481  cbq_ovl_classic(cl);
482 }
483 
484 /* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
485 
486 static void cbq_ovl_delay(struct cbq_class *cl)
487 {
488  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
489  psched_tdiff_t delay = cl->undertime - q->now;
490 
491  if (test_bit(__QDISC_STATE_DEACTIVATED,
492  &qdisc_root_sleeping(cl->qdisc)->state))
493  return;
494 
495  if (!cl->delayed) {
496  psched_time_t sched = q->now;
497  ktime_t expires;
498 
499  delay += cl->offtime;
500  if (cl->avgidle < 0)
501  delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
502  if (cl->avgidle < cl->minidle)
503  cl->avgidle = cl->minidle;
504  cl->undertime = q->now + delay;
505 
506  if (delay > 0) {
507  sched += delay + cl->penalty;
508  cl->penalized = sched;
509  cl->cpriority = TC_CBQ_MAXPRIO;
510  q->pmask |= (1<<TC_CBQ_MAXPRIO);
511 
512  expires = ktime_set(0, 0);
513  expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
514  if (hrtimer_try_to_cancel(&q->delay_timer) &&
515  ktime_to_ns(ktime_sub(
516  hrtimer_get_expires(&q->delay_timer),
517  expires)) > 0)
518  hrtimer_set_expires(&q->delay_timer, expires);
519  hrtimer_restart(&q->delay_timer);
520  cl->delayed = 1;
521  cl->xstats.overactions++;
522  return;
523  }
524  delay = 1;
525  }
526  if (q->wd_expires == 0 || q->wd_expires > delay)
527  q->wd_expires = delay;
528 }
529 
530 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
531 
532 static void cbq_ovl_lowprio(struct cbq_class *cl)
533 {
534  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
535 
536  cl->penalized = q->now + cl->penalty;
537 
538  if (cl->cpriority != cl->priority2) {
539  cl->cpriority = cl->priority2;
540  q->pmask |= (1<<cl->cpriority);
541  cl->xstats.overactions++;
542  }
543  cbq_ovl_classic(cl);
544 }
545 
546 /* TC_CBQ_OVL_DROP: penalize class by dropping */
547 
548 static void cbq_ovl_drop(struct cbq_class *cl)
549 {
550  if (cl->q->ops->drop)
551  if (cl->q->ops->drop(cl->q))
552  cl->qdisc->q.qlen--;
553  cl->xstats.overactions++;
554  cbq_ovl_classic(cl);
555 }
556 
557 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
558  psched_time_t now)
559 {
560  struct cbq_class *cl;
561  struct cbq_class *cl_prev = q->active[prio];
562  psched_time_t sched = now;
563 
564  if (cl_prev == NULL)
565  return 0;
566 
567  do {
568  cl = cl_prev->next_alive;
569  if (now - cl->penalized > 0) {
570  cl_prev->next_alive = cl->next_alive;
571  cl->next_alive = NULL;
572  cl->cpriority = cl->priority;
573  cl->delayed = 0;
574  cbq_activate_class(cl);
575 
576  if (cl == q->active[prio]) {
577  q->active[prio] = cl_prev;
578  if (cl == q->active[prio]) {
579  q->active[prio] = NULL;
580  return 0;
581  }
582  }
583 
584  cl = cl_prev->next_alive;
585  } else if (sched - cl->penalized > 0)
586  sched = cl->penalized;
587  } while ((cl_prev = cl) != q->active[prio]);
588 
589  return sched - now;
590 }
591 
592 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
593 {
594  struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
595  delay_timer);
596  struct Qdisc *sch = q->watchdog.qdisc;
597  psched_time_t now;
598  psched_tdiff_t delay = 0;
599  unsigned int pmask;
600 
601  now = psched_get_time();
602 
603  pmask = q->pmask;
604  q->pmask = 0;
605 
606  while (pmask) {
607  int prio = ffz(~pmask);
608  psched_tdiff_t tmp;
609 
610  pmask &= ~(1<<prio);
611 
612  tmp = cbq_undelay_prio(q, prio, now);
613  if (tmp > 0) {
614  q->pmask |= 1<<prio;
615  if (tmp < delay || delay == 0)
616  delay = tmp;
617  }
618  }
619 
620  if (delay) {
621  ktime_t time;
622 
623  time = ktime_set(0, 0);
624  time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
625  hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
626  }
627 
628  qdisc_unthrottled(sch);
629  __netif_schedule(qdisc_root(sch));
630  return HRTIMER_NORESTART;
631 }
632 
633 #ifdef CONFIG_NET_CLS_ACT
634 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
635 {
636  struct Qdisc *sch = child->__parent;
637  struct cbq_sched_data *q = qdisc_priv(sch);
638  struct cbq_class *cl = q->rx_class;
639 
640  q->rx_class = NULL;
641 
642  if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
643  int ret;
644 
645  cbq_mark_toplevel(q, cl);
646 
647  q->rx_class = cl;
648  cl->q->__parent = sch;
649 
650  ret = qdisc_enqueue(skb, cl->q);
651  if (ret == NET_XMIT_SUCCESS) {
652  sch->q.qlen++;
653  if (!cl->next_alive)
654  cbq_activate_class(cl);
655  return 0;
656  }
657  if (net_xmit_drop_count(ret))
658  sch->qstats.drops++;
659  return 0;
660  }
661 
662  sch->qstats.drops++;
663  return -1;
664 }
665 #endif
666 
667 /*
668  * This is a mission-critical procedure.
669  *
670  * We "regenerate" the toplevel cutoff if the transmitting class
671  * has backlog and is not regulated. This is not part of the
672  * original CBQ description, but looks more reasonable.
673  * Probably it is wrong. This question needs further investigation.
674  */
675 
676 static inline void
677 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
678  struct cbq_class *borrowed)
679 {
680  if (cl && q->toplevel >= borrowed->level) {
681  if (cl->q->q.qlen > 1) {
682  do {
683  if (borrowed->undertime == PSCHED_PASTPERFECT) {
684  q->toplevel = borrowed->level;
685  return;
686  }
687  } while ((borrowed = borrowed->borrow) != NULL);
688  }
689 #if 0
690  /* It is not necessary now. Uncommenting it
691  will save CPU cycles, but decrease fairness.
692  */
693  q->toplevel = TC_CBQ_MAXLEVEL;
694 #endif
695  }
696 }
697 
698 static void
699 cbq_update(struct cbq_sched_data *q)
700 {
701  struct cbq_class *this = q->tx_class;
702  struct cbq_class *cl = this;
703  int len = q->tx_len;
704 
705  q->tx_class = NULL;
706 
707  for ( ; cl; cl = cl->share) {
708  long avgidle = cl->avgidle;
709  long idle;
710 
711  cl->bstats.packets++;
712  cl->bstats.bytes += len;
713 
714  /*
715  * (now - last) is total time between packet right edges.
716  * (last_pktlen/rate) is "virtual" busy time, so that
717  *
718  * idle = (now - last) - last_pktlen/rate
719  */
720 
721  idle = q->now - cl->last;
722  if ((unsigned long)idle > 128*1024*1024) {
723  avgidle = cl->maxidle;
724  } else {
725  idle -= L2T(cl, len);
726 
727  /* true_avgidle := (1-W)*true_avgidle + W*idle,
728  * where W=2^{-ewma_log}. But cl->avgidle is scaled:
729  * cl->avgidle == true_avgidle/W,
730  * hence:
731  */
732  avgidle += idle - (avgidle>>cl->ewma_log);
733  }
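/* Worked example with illustrative numbers (not from this file): with
 * ewma_log = 5 (W = 1/32), a scaled avgidle of 3200 (true_avgidle = 100)
 * and a measured idle of -800, the update above gives
 *	avgidle = 3200 + (-800) - (3200 >> 5) = 2300,
 * which matches the unscaled EWMA: ((1 - 1/32)*100 + (1/32)*(-800))*32 = 2300.
 */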
734 
735  if (avgidle <= 0) {
736  /* Overlimit or at-limit */
737 
738  if (avgidle < cl->minidle)
739  avgidle = cl->minidle;
740 
741  cl->avgidle = avgidle;
742 
743  /* Calculate expected time, when this class
744  * will be allowed to send.
745  * It will occur, when:
746  * (1-W)*true_avgidle + W*delay = 0, i.e.
747  * idle = (1/W - 1)*(-true_avgidle)
748  * or
749  * idle = (1 - W)*(-cl->avgidle);
750  */
751  idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
752 
753  /*
754  * That is not all.
755  * To maintain the rate allocated to the class,
756  * we add to undertime virtual clock,
757  * necessary to complete transmitted packet.
758  * (len/phys_bandwidth has been already passed
759  * to the moment of cbq_update)
760  */
761 
762  idle -= L2T(&q->link, len);
763  idle += L2T(cl, len);
764 
765  cl->undertime = q->now + idle;
766  } else {
767  /* Underlimit */
768 
769  cl->undertime = PSCHED_PASTPERFECT;
770  if (avgidle > cl->maxidle)
771  cl->avgidle = cl->maxidle;
772  else
773  cl->avgidle = avgidle;
774  }
775  cl->last = q->now;
776  }
777 
778  cbq_update_toplevel(q, this, q->tx_borrowed);
779 }
780 
781 static inline struct cbq_class *
782 cbq_under_limit(struct cbq_class *cl)
783 {
784  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
785  struct cbq_class *this_cl = cl;
786 
787  if (cl->tparent == NULL)
788  return cl;
789 
790  if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
791  cl->delayed = 0;
792  return cl;
793  }
794 
795  do {
796  /* This is a very suspicious place. Currently the overlimit
797  * action is generated for non-bounded classes
798  * only if the link is completely congested.
799  * Though this agrees with the ancestor-only paradigm,
800  * it looks very stupid. In particular,
801  * it means that this chunk of code will either
802  * never be called or result in strong amplification
803  * of burstiness. Dangerous and silly, but no
804  * other solution exists.
805  */
806  cl = cl->borrow;
807  if (!cl) {
808  this_cl->qstats.overlimits++;
809  this_cl->overlimit(this_cl);
810  return NULL;
811  }
812  if (cl->level > q->toplevel)
813  return NULL;
814  } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
815 
816  cl->delayed = 0;
817  return cl;
818 }
819 
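/* One WRR pass over the active classes of a single priority band: classes that
 * are empty, or overlimit and unable to borrow, are skipped; a dequeued
 * packet's length is charged against the class deficit, and a class whose
 * deficit is exhausted gets one quantum added while the round moves on.
 */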
820 static inline struct sk_buff *
821 cbq_dequeue_prio(struct Qdisc *sch, int prio)
822 {
823  struct cbq_sched_data *q = qdisc_priv(sch);
824  struct cbq_class *cl_tail, *cl_prev, *cl;
825  struct sk_buff *skb;
826  int deficit;
827 
828  cl_tail = cl_prev = q->active[prio];
829  cl = cl_prev->next_alive;
830 
831  do {
832  deficit = 0;
833 
834  /* Start round */
835  do {
836  struct cbq_class *borrow = cl;
837 
838  if (cl->q->q.qlen &&
839  (borrow = cbq_under_limit(cl)) == NULL)
840  goto skip_class;
841 
842  if (cl->deficit <= 0) {
843  /* Class exhausted its allotment per
844  * this round. Switch to the next one.
845  */
846  deficit = 1;
847  cl->deficit += cl->quantum;
848  goto next_class;
849  }
850 
851  skb = cl->q->dequeue(cl->q);
852 
853  /* Class did not give us any skb :-(
854  * It could occur even if cl->q->q.qlen != 0
855  * f.e. if cl->q == "tbf"
856  */
857  if (skb == NULL)
858  goto skip_class;
859 
860  cl->deficit -= qdisc_pkt_len(skb);
861  q->tx_class = cl;
862  q->tx_borrowed = borrow;
863  if (borrow != cl) {
864 #ifndef CBQ_XSTATS_BORROWS_BYTES
865  borrow->xstats.borrows++;
866  cl->xstats.borrows++;
867 #else
868  borrow->xstats.borrows += qdisc_pkt_len(skb);
869  cl->xstats.borrows += qdisc_pkt_len(skb);
870 #endif
871  }
872  q->tx_len = qdisc_pkt_len(skb);
873 
874  if (cl->deficit <= 0) {
875  q->active[prio] = cl;
876  cl = cl->next_alive;
877  cl->deficit += cl->quantum;
878  }
879  return skb;
880 
881 skip_class:
882  if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
883  /* Class is empty or penalized.
884  * Unlink it from active chain.
885  */
886  cl_prev->next_alive = cl->next_alive;
887  cl->next_alive = NULL;
888 
889  /* Did cl_tail point to it? */
890  if (cl == cl_tail) {
891  /* Repair it! */
892  cl_tail = cl_prev;
893 
894  /* Was it the last class in this band? */
895  if (cl == cl_tail) {
896  /* Kill the band! */
897  q->active[prio] = NULL;
898  q->activemask &= ~(1<<prio);
899  if (cl->q->q.qlen)
900  cbq_activate_class(cl);
901  return NULL;
902  }
903 
904  q->active[prio] = cl_tail;
905  }
906  if (cl->q->q.qlen)
907  cbq_activate_class(cl);
908 
909  cl = cl_prev;
910  }
911 
912 next_class:
913  cl_prev = cl;
914  cl = cl->next_alive;
915  } while (cl_prev != cl_tail);
916  } while (deficit);
917 
918  q->active[prio] = cl_prev;
919 
920  return NULL;
921 }
922 
923 static inline struct sk_buff *
924 cbq_dequeue_1(struct Qdisc *sch)
925 {
926  struct cbq_sched_data *q = qdisc_priv(sch);
927  struct sk_buff *skb;
928  unsigned int activemask;
929 
930  activemask = q->activemask & 0xFF;
931  while (activemask) {
932  int prio = ffz(~activemask);
933  activemask &= ~(1<<prio);
934  skb = cbq_dequeue_prio(sch, prio);
935  if (skb)
936  return skb;
937  }
938  return NULL;
939 }
940 
941 static struct sk_buff *
942 cbq_dequeue(struct Qdisc *sch)
943 {
944  struct sk_buff *skb;
945  struct cbq_sched_data *q = qdisc_priv(sch);
946  psched_time_t now;
947  psched_tdiff_t incr;
948 
949  now = psched_get_time();
950  incr = now - q->now_rt;
951 
952  if (q->tx_class) {
953  psched_tdiff_t incr2;
954  /* Time integrator. We calculate EOS time
955  * by adding expected packet transmission time.
956  * If real time is greater, we warp artificial clock,
957  * so that:
958  *
959  * cbq_time = max(real_time, work);
960  */
961  incr2 = L2T(&q->link, q->tx_len);
962  q->now += incr2;
963  cbq_update(q);
964  if ((incr -= incr2) < 0)
965  incr = 0;
966  }
967  q->now += incr;
968  q->now_rt = now;
969 
970  for (;;) {
971  q->wd_expires = 0;
972 
973  skb = cbq_dequeue_1(sch);
974  if (skb) {
975  qdisc_bstats_update(sch, skb);
976  sch->q.qlen--;
977  qdisc_unthrottled(sch);
978  return skb;
979  }
980 
981  /* All the classes are overlimit.
982  *
983  * This is possible if:
984  *
985  * 1. The scheduler is empty.
986  * 2. The toplevel cutoff inhibited borrowing.
987  * 3. The root class is overlimit.
988  *
989  * Reset the 2nd and 3rd conditions and retry.
990  *
991  * Note that NS and cbq-2.0 are buggy here: peeking
992  * an arbitrary class is appropriate for ancestor-only
993  * sharing, but not for the toplevel algorithm.
994  *
995  * Our version is better but slower, because it requires
996  * two passes; this is unavoidable with top-level sharing.
997  */
998 
999  if (q->toplevel == TC_CBQ_MAXLEVEL &&
1000  q->link.undertime == PSCHED_PASTPERFECT)
1001  break;
1002 
1003  q->toplevel = TC_CBQ_MAXLEVEL;
1004  q->link.undertime = PSCHED_PASTPERFECT;
1005  }
1006 
1007  /* No packets in scheduler or nobody wants to give them to us :-(
1008  * Sigh... start the watchdog timer in the latter case.
1009  */
1010 
1011  if (sch->q.qlen) {
1012  sch->qstats.overlimits++;
1013  if (q->wd_expires)
1014  qdisc_watchdog_schedule(&q->watchdog,
1015  now + q->wd_expires);
1016  }
1017  return NULL;
1018 }
1019 
1020 /* CBQ class maintenance routines */
1021 
1022 static void cbq_adjust_levels(struct cbq_class *this)
1023 {
1024  if (this == NULL)
1025  return;
1026 
1027  do {
1028  int level = 0;
1029  struct cbq_class *cl;
1030 
1031  cl = this->children;
1032  if (cl) {
1033  do {
1034  if (cl->level > level)
1035  level = cl->level;
1036  } while ((cl = cl->sibling) != this->children);
1037  }
1038  this->level = level + 1;
1039  } while ((this = this->tparent) != NULL);
1040 }
1041 
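/* Recompute the WRR quantum of every class in the given priority band so that
 * quanta stay proportional to class weights: q->quanta[prio] holds the sum of
 * weights in the band and q->nclasses[prio] the number of classes, so
 * quantum = weight * allot * nclasses / sum_of_weights.
 */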
1042 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1043 {
1044  struct cbq_class *cl;
1045  struct hlist_node *n;
1046  unsigned int h;
1047 
1048  if (q->quanta[prio] == 0)
1049  return;
1050 
1051  for (h = 0; h < q->clhash.hashsize; h++) {
1052  hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1053  /* BUGGGG... Beware! This expression suffers from
1054  * arithmetic overflows!
1055  */
1056  if (cl->priority == prio) {
1057  cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1058  q->quanta[prio];
1059  }
1060  if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1061  pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
1062  cl->common.classid, cl->quantum);
1063  cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1064  }
1065  }
1066  }
1067 }
1068 
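/* Rebuild the split node's per-priority default table: drop entries that this
 * class no longer claims in its defmap, then refill each empty slot with the
 * lowest-level class attached to the same split node that claims that
 * priority.
 */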
1069 static void cbq_sync_defmap(struct cbq_class *cl)
1070 {
1071  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1072  struct cbq_class *split = cl->split;
1073  unsigned int h;
1074  int i;
1075 
1076  if (split == NULL)
1077  return;
1078 
1079  for (i = 0; i <= TC_PRIO_MAX; i++) {
1080  if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
1081  split->defaults[i] = NULL;
1082  }
1083 
1084  for (i = 0; i <= TC_PRIO_MAX; i++) {
1085  int level = split->level;
1086 
1087  if (split->defaults[i])
1088  continue;
1089 
1090  for (h = 0; h < q->clhash.hashsize; h++) {
1091  struct hlist_node *n;
1092  struct cbq_class *c;
1093 
1094  hlist_for_each_entry(c, n, &q->clhash.hash[h],
1095  common.hnode) {
1096  if (c->split == split && c->level < level &&
1097  c->defmap & (1<<i)) {
1098  split->defaults[i] = c;
1099  level = c->level;
1100  }
1101  }
1102  }
1103  }
1104 }
1105 
1106 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1107 {
1108  struct cbq_class *split = NULL;
1109 
1110  if (splitid == 0) {
1111  split = cl->split;
1112  if (!split)
1113  return;
1114  splitid = split->common.classid;
1115  }
1116 
1117  if (split == NULL || split->common.classid != splitid) {
1118  for (split = cl->tparent; split; split = split->tparent)
1119  if (split->common.classid == splitid)
1120  break;
1121  }
1122 
1123  if (split == NULL)
1124  return;
1125 
1126  if (cl->split != split) {
1127  cl->defmap = 0;
1128  cbq_sync_defmap(cl);
1129  cl->split = split;
1130  cl->defmap = def & mask;
1131  } else
1132  cl->defmap = (cl->defmap & ~mask) | (def & mask);
1133 
1134  cbq_sync_defmap(cl);
1135 }
1136 
1137 static void cbq_unlink_class(struct cbq_class *this)
1138 {
1139  struct cbq_class *cl, **clp;
1140  struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1141 
1142  qdisc_class_hash_remove(&q->clhash, &this->common);
1143 
1144  if (this->tparent) {
1145  clp = &this->sibling;
1146  cl = *clp;
1147  do {
1148  if (cl == this) {
1149  *clp = cl->sibling;
1150  break;
1151  }
1152  clp = &cl->sibling;
1153  } while ((cl = *clp) != this->sibling);
1154 
1155  if (this->tparent->children == this) {
1156  this->tparent->children = this->sibling;
1157  if (this->sibling == this)
1158  this->tparent->children = NULL;
1159  }
1160  } else {
1161  WARN_ON(this->sibling != this);
1162  }
1163 }
1164 
1165 static void cbq_link_class(struct cbq_class *this)
1166 {
1167  struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1168  struct cbq_class *parent = this->tparent;
1169 
1170  this->sibling = this;
1171  qdisc_class_hash_insert(&q->clhash, &this->common);
1172 
1173  if (parent == NULL)
1174  return;
1175 
1176  if (parent->children == NULL) {
1177  parent->children = this;
1178  } else {
1179  this->sibling = parent->children->sibling;
1180  parent->children->sibling = this;
1181  }
1182 }
1183 
1184 static unsigned int cbq_drop(struct Qdisc *sch)
1185 {
1186  struct cbq_sched_data *q = qdisc_priv(sch);
1187  struct cbq_class *cl, *cl_head;
1188  int prio;
1189  unsigned int len;
1190 
1191  for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1192  cl_head = q->active[prio];
1193  if (!cl_head)
1194  continue;
1195 
1196  cl = cl_head;
1197  do {
1198  if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1199  sch->q.qlen--;
1200  if (!cl->q->q.qlen)
1201  cbq_deactivate_class(cl);
1202  return len;
1203  }
1204  } while ((cl = cl->next_alive) != cl_head);
1205  }
1206  return 0;
1207 }
1208 
1209 static void
1210 cbq_reset(struct Qdisc *sch)
1211 {
1212  struct cbq_sched_data *q = qdisc_priv(sch);
1213  struct cbq_class *cl;
1214  struct hlist_node *n;
1215  int prio;
1216  unsigned int h;
1217 
1218  q->activemask = 0;
1219  q->pmask = 0;
1220  q->tx_class = NULL;
1221  q->tx_borrowed = NULL;
1222  qdisc_watchdog_cancel(&q->watchdog);
1223  hrtimer_cancel(&q->delay_timer);
1224  q->toplevel = TC_CBQ_MAXLEVEL;
1225  q->now = psched_get_time();
1226  q->now_rt = q->now;
1227 
1228  for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1229  q->active[prio] = NULL;
1230 
1231  for (h = 0; h < q->clhash.hashsize; h++) {
1232  hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1233  qdisc_reset(cl->q);
1234 
1235  cl->next_alive = NULL;
1236  cl->undertime = PSCHED_PASTPERFECT;
1237  cl->avgidle = cl->maxidle;
1238  cl->deficit = cl->quantum;
1239  cl->cpriority = cl->priority;
1240  }
1241  }
1242  sch->q.qlen = 0;
1243 }
1244 
1245 
1246 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1247 {
1248  if (lss->change & TCF_CBQ_LSS_FLAGS) {
1249  cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1250  cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1251  }
1252  if (lss->change & TCF_CBQ_LSS_EWMA)
1253  cl->ewma_log = lss->ewma_log;
1254  if (lss->change & TCF_CBQ_LSS_AVPKT)
1255  cl->avpkt = lss->avpkt;
1256  if (lss->change & TCF_CBQ_LSS_MINIDLE)
1257  cl->minidle = -(long)lss->minidle;
1258  if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1259  cl->maxidle = lss->maxidle;
1260  cl->avgidle = lss->maxidle;
1261  }
1262  if (lss->change & TCF_CBQ_LSS_OFFTIME)
1263  cl->offtime = lss->offtime;
1264  return 0;
1265 }
1266 
1267 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1268 {
1269  q->nclasses[cl->priority]--;
1270  q->quanta[cl->priority] -= cl->weight;
1271  cbq_normalize_quanta(q, cl->priority);
1272 }
1273 
1274 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1275 {
1276  q->nclasses[cl->priority]++;
1277  q->quanta[cl->priority] += cl->weight;
1278  cbq_normalize_quanta(q, cl->priority);
1279 }
1280 
1281 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1282 {
1283  struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1284 
1285  if (wrr->allot)
1286  cl->allot = wrr->allot;
1287  if (wrr->weight)
1288  cl->weight = wrr->weight;
1289  if (wrr->priority) {
1290  cl->priority = wrr->priority - 1;
1291  cl->cpriority = cl->priority;
1292  if (cl->priority >= cl->priority2)
1293  cl->priority2 = TC_CBQ_MAXPRIO - 1;
1294  }
1295 
1296  cbq_addprio(q, cl);
1297  return 0;
1298 }
1299 
1300 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1301 {
1302  switch (ovl->strategy) {
1303  case TC_CBQ_OVL_CLASSIC:
1304  cl->overlimit = cbq_ovl_classic;
1305  break;
1306  case TC_CBQ_OVL_DELAY:
1307  cl->overlimit = cbq_ovl_delay;
1308  break;
1309  case TC_CBQ_OVL_LOWPRIO:
1310  if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
1311  ovl->priority2 - 1 <= cl->priority)
1312  return -EINVAL;
1313  cl->priority2 = ovl->priority2 - 1;
1314  cl->overlimit = cbq_ovl_lowprio;
1315  break;
1316  case TC_CBQ_OVL_DROP:
1317  cl->overlimit = cbq_ovl_drop;
1318  break;
1319  case TC_CBQ_OVL_RCLASSIC:
1320  cl->overlimit = cbq_ovl_rclassic;
1321  break;
1322  default:
1323  return -EINVAL;
1324  }
1325  cl->penalty = ovl->penalty;
1326  return 0;
1327 }
1328 
1329 #ifdef CONFIG_NET_CLS_ACT
1330 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1331 {
1332  cl->police = p->police;
1333 
1334  if (cl->q->handle) {
1335  if (p->police == TC_POLICE_RECLASSIFY)
1336  cl->q->reshape_fail = cbq_reshape_fail;
1337  else
1338  cl->q->reshape_fail = NULL;
1339  }
1340  return 0;
1341 }
1342 #endif
1343 
1344 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1345 {
1346  cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1347  return 0;
1348 }
1349 
1350 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1351  [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1352  [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1353  [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1354  [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1355  [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1356  [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1357  [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1358 };
1359 
1360 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1361 {
1362  struct cbq_sched_data *q = qdisc_priv(sch);
1363  struct nlattr *tb[TCA_CBQ_MAX + 1];
1364  struct tc_ratespec *r;
1365  int err;
1366 
1367  err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1368  if (err < 0)
1369  return err;
1370 
1371  if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1372  return -EINVAL;
1373 
1374  r = nla_data(tb[TCA_CBQ_RATE]);
1375 
1376  if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1377  return -EINVAL;
1378 
1379  err = qdisc_class_hash_init(&q->clhash);
1380  if (err < 0)
1381  goto put_rtab;
1382 
1383  q->link.refcnt = 1;
1384  q->link.sibling = &q->link;
1385  q->link.common.classid = sch->handle;
1386  q->link.qdisc = sch;
1387  q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1388  sch->handle);
1389  if (!q->link.q)
1390  q->link.q = &noop_qdisc;
1391 
1392  q->link.priority = TC_CBQ_MAXPRIO - 1;
1393  q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1394  q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1395  q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1396  q->link.overlimit = cbq_ovl_classic;
1397  q->link.allot = psched_mtu(qdisc_dev(sch));
1398  q->link.quantum = q->link.allot;
1399  q->link.weight = q->link.R_tab->rate.rate;
1400 
1401  q->link.ewma_log = TC_CBQ_DEF_EWMA;
1402  q->link.avpkt = q->link.allot/2;
1403  q->link.minidle = -0x7FFFFFFF;
1404 
1405  qdisc_watchdog_init(&q->watchdog, sch);
1406  hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1407  q->delay_timer.function = cbq_undelay;
1408  q->toplevel = TC_CBQ_MAXLEVEL;
1409  q->now = psched_get_time();
1410  q->now_rt = q->now;
1411 
1412  cbq_link_class(&q->link);
1413 
1414  if (tb[TCA_CBQ_LSSOPT])
1415  cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1416 
1417  cbq_addprio(q, &q->link);
1418  return 0;
1419 
1420 put_rtab:
1421  qdisc_put_rtab(q->link.R_tab);
1422  return err;
1423 }
1424 
1425 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1426 {
1427  unsigned char *b = skb_tail_pointer(skb);
1428 
1429  if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1430  goto nla_put_failure;
1431  return skb->len;
1432 
1433 nla_put_failure:
1434  nlmsg_trim(skb, b);
1435  return -1;
1436 }
1437 
1438 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1439 {
1440  unsigned char *b = skb_tail_pointer(skb);
1441  struct tc_cbq_lssopt opt;
1442 
1443  opt.flags = 0;
1444  if (cl->borrow == NULL)
1445  opt.flags |= TCF_CBQ_LSS_BOUNDED;
1446  if (cl->share == NULL)
1447  opt.flags |= TCF_CBQ_LSS_ISOLATED;
1448  opt.ewma_log = cl->ewma_log;
1449  opt.level = cl->level;
1450  opt.avpkt = cl->avpkt;
1451  opt.maxidle = cl->maxidle;
1452  opt.minidle = (u32)(-cl->minidle);
1453  opt.offtime = cl->offtime;
1454  opt.change = ~0;
1455  if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1456  goto nla_put_failure;
1457  return skb->len;
1458 
1459 nla_put_failure:
1460  nlmsg_trim(skb, b);
1461  return -1;
1462 }
1463 
1464 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1465 {
1466  unsigned char *b = skb_tail_pointer(skb);
1467  struct tc_cbq_wrropt opt;
1468 
1469  opt.flags = 0;
1470  opt.allot = cl->allot;
1471  opt.priority = cl->priority + 1;
1472  opt.cpriority = cl->cpriority + 1;
1473  opt.weight = cl->weight;
1474  if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1475  goto nla_put_failure;
1476  return skb->len;
1477 
1478 nla_put_failure:
1479  nlmsg_trim(skb, b);
1480  return -1;
1481 }
1482 
1483 static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1484 {
1485  unsigned char *b = skb_tail_pointer(skb);
1486  struct tc_cbq_ovl opt;
1487 
1488  opt.strategy = cl->ovl_strategy;
1489  opt.priority2 = cl->priority2 + 1;
1490  opt.pad = 0;
1491  opt.penalty = cl->penalty;
1492  if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
1493  goto nla_put_failure;
1494  return skb->len;
1495 
1496 nla_put_failure:
1497  nlmsg_trim(skb, b);
1498  return -1;
1499 }
1500 
1501 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1502 {
1503  unsigned char *b = skb_tail_pointer(skb);
1504  struct tc_cbq_fopt opt;
1505 
1506  if (cl->split || cl->defmap) {
1507  opt.split = cl->split ? cl->split->common.classid : 0;
1508  opt.defmap = cl->defmap;
1509  opt.defchange = ~0;
1510  if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1511  goto nla_put_failure;
1512  }
1513  return skb->len;
1514 
1515 nla_put_failure:
1516  nlmsg_trim(skb, b);
1517  return -1;
1518 }
1519 
1520 #ifdef CONFIG_NET_CLS_ACT
1521 static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1522 {
1523  unsigned char *b = skb_tail_pointer(skb);
1524  struct tc_cbq_police opt;
1525 
1526  if (cl->police) {
1527  opt.police = cl->police;
1528  opt.__res1 = 0;
1529  opt.__res2 = 0;
1530  if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
1531  goto nla_put_failure;
1532  }
1533  return skb->len;
1534 
1535 nla_put_failure:
1536  nlmsg_trim(skb, b);
1537  return -1;
1538 }
1539 #endif
1540 
1541 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1542 {
1543  if (cbq_dump_lss(skb, cl) < 0 ||
1544  cbq_dump_rate(skb, cl) < 0 ||
1545  cbq_dump_wrr(skb, cl) < 0 ||
1546  cbq_dump_ovl(skb, cl) < 0 ||
1547 #ifdef CONFIG_NET_CLS_ACT
1548  cbq_dump_police(skb, cl) < 0 ||
1549 #endif
1550  cbq_dump_fopt(skb, cl) < 0)
1551  return -1;
1552  return 0;
1553 }
1554 
1555 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1556 {
1557  struct cbq_sched_data *q = qdisc_priv(sch);
1558  struct nlattr *nest;
1559 
1560  nest = nla_nest_start(skb, TCA_OPTIONS);
1561  if (nest == NULL)
1562  goto nla_put_failure;
1563  if (cbq_dump_attr(skb, &q->link) < 0)
1564  goto nla_put_failure;
1565  nla_nest_end(skb, nest);
1566  return skb->len;
1567 
1568 nla_put_failure:
1569  nla_nest_cancel(skb, nest);
1570  return -1;
1571 }
1572 
1573 static int
1574 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1575 {
1576  struct cbq_sched_data *q = qdisc_priv(sch);
1577 
1578  q->link.xstats.avgidle = q->link.avgidle;
1579  return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1580 }
1581 
1582 static int
1583 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1584  struct sk_buff *skb, struct tcmsg *tcm)
1585 {
1586  struct cbq_class *cl = (struct cbq_class *)arg;
1587  struct nlattr *nest;
1588 
1589  if (cl->tparent)
1590  tcm->tcm_parent = cl->tparent->common.classid;
1591  else
1592  tcm->tcm_parent = TC_H_ROOT;
1593  tcm->tcm_handle = cl->common.classid;
1594  tcm->tcm_info = cl->q->handle;
1595 
1596  nest = nla_nest_start(skb, TCA_OPTIONS);
1597  if (nest == NULL)
1598  goto nla_put_failure;
1599  if (cbq_dump_attr(skb, cl) < 0)
1600  goto nla_put_failure;
1601  nla_nest_end(skb, nest);
1602  return skb->len;
1603 
1604 nla_put_failure:
1605  nla_nest_cancel(skb, nest);
1606  return -1;
1607 }
1608 
1609 static int
1610 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1611  struct gnet_dump *d)
1612 {
1613  struct cbq_sched_data *q = qdisc_priv(sch);
1614  struct cbq_class *cl = (struct cbq_class *)arg;
1615 
1616  cl->qstats.qlen = cl->q->q.qlen;
1617  cl->xstats.avgidle = cl->avgidle;
1618  cl->xstats.undertime = 0;
1619 
1620  if (cl->undertime != PSCHED_PASTPERFECT)
1621  cl->xstats.undertime = cl->undertime - q->now;
1622 
1623  if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1624  gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1625  gnet_stats_copy_queue(d, &cl->qstats) < 0)
1626  return -1;
1627 
1628  return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1629 }
1630 
1631 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1632  struct Qdisc **old)
1633 {
1634  struct cbq_class *cl = (struct cbq_class *)arg;
1635 
1636  if (new == NULL) {
1637  new = qdisc_create_dflt(sch->dev_queue,
1638  &pfifo_qdisc_ops, cl->common.classid);
1639  if (new == NULL)
1640  return -ENOBUFS;
1641  } else {
1642 #ifdef CONFIG_NET_CLS_ACT
1643  if (cl->police == TC_POLICE_RECLASSIFY)
1644  new->reshape_fail = cbq_reshape_fail;
1645 #endif
1646  }
1647  sch_tree_lock(sch);
1648  *old = cl->q;
1649  cl->q = new;
1650  qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1651  qdisc_reset(*old);
1652  sch_tree_unlock(sch);
1653 
1654  return 0;
1655 }
1656 
1657 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1658 {
1659  struct cbq_class *cl = (struct cbq_class *)arg;
1660 
1661  return cl->q;
1662 }
1663 
1664 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1665 {
1666  struct cbq_class *cl = (struct cbq_class *)arg;
1667 
1668  if (cl->q->q.qlen == 0)
1669  cbq_deactivate_class(cl);
1670 }
1671 
1672 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1673 {
1674  struct cbq_sched_data *q = qdisc_priv(sch);
1675  struct cbq_class *cl = cbq_class_lookup(q, classid);
1676 
1677  if (cl) {
1678  cl->refcnt++;
1679  return (unsigned long)cl;
1680  }
1681  return 0;
1682 }
1683 
1684 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1685 {
1686  struct cbq_sched_data *q = qdisc_priv(sch);
1687 
1688  WARN_ON(cl->filters);
1689 
1690  tcf_destroy_chain(&cl->filter_list);
1691  qdisc_destroy(cl->q);
1692  qdisc_put_rtab(cl->R_tab);
1693  gen_kill_estimator(&cl->bstats, &cl->rate_est);
1694  if (cl != &q->link)
1695  kfree(cl);
1696 }
1697 
1698 static void cbq_destroy(struct Qdisc *sch)
1699 {
1700  struct cbq_sched_data *q = qdisc_priv(sch);
1701  struct hlist_node *n, *next;
1702  struct cbq_class *cl;
1703  unsigned int h;
1704 
1705 #ifdef CONFIG_NET_CLS_ACT
1706  q->rx_class = NULL;
1707 #endif
1708  /*
1709  * Filters must be destroyed first because we don't destroy the
1710  * classes from root to leaves, which means that filters can still
1711  * be bound to classes which have been destroyed already. --TGR '04
1712  */
1713  for (h = 0; h < q->clhash.hashsize; h++) {
1714  hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1715  tcf_destroy_chain(&cl->filter_list);
1716  }
1717  for (h = 0; h < q->clhash.hashsize; h++) {
1718  hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1719  common.hnode)
1720  cbq_destroy_class(sch, cl);
1721  }
1722  qdisc_class_hash_destroy(&q->clhash);
1723 }
1724 
1725 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1726 {
1727  struct cbq_class *cl = (struct cbq_class *)arg;
1728 
1729  if (--cl->refcnt == 0) {
1730 #ifdef CONFIG_NET_CLS_ACT
1731  spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1732  struct cbq_sched_data *q = qdisc_priv(sch);
1733 
1734  spin_lock_bh(root_lock);
1735  if (q->rx_class == cl)
1736  q->rx_class = NULL;
1737  spin_unlock_bh(root_lock);
1738 #endif
1739 
1740  cbq_destroy_class(sch, cl);
1741  }
1742 }
1743 
1744 static int
1745 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1746  unsigned long *arg)
1747 {
1748  int err;
1749  struct cbq_sched_data *q = qdisc_priv(sch);
1750  struct cbq_class *cl = (struct cbq_class *)*arg;
1751  struct nlattr *opt = tca[TCA_OPTIONS];
1752  struct nlattr *tb[TCA_CBQ_MAX + 1];
1753  struct cbq_class *parent;
1754  struct qdisc_rate_table *rtab = NULL;
1755 
1756  if (opt == NULL)
1757  return -EINVAL;
1758 
1759  err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1760  if (err < 0)
1761  return err;
1762 
1763  if (cl) {
1764  /* Check parent */
1765  if (parentid) {
1766  if (cl->tparent &&
1767  cl->tparent->common.classid != parentid)
1768  return -EINVAL;
1769  if (!cl->tparent && parentid != TC_H_ROOT)
1770  return -EINVAL;
1771  }
1772 
1773  if (tb[TCA_CBQ_RATE]) {
1774  rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1775  tb[TCA_CBQ_RTAB]);
1776  if (rtab == NULL)
1777  return -EINVAL;
1778  }
1779 
1780  if (tca[TCA_RATE]) {
1781  err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1782  qdisc_root_sleeping_lock(sch),
1783  tca[TCA_RATE]);
1784  if (err) {
1785  if (rtab)
1786  qdisc_put_rtab(rtab);
1787  return err;
1788  }
1789  }
1790 
1791  /* Change class parameters */
1792  sch_tree_lock(sch);
1793 
1794  if (cl->next_alive != NULL)
1795  cbq_deactivate_class(cl);
1796 
1797  if (rtab) {
1798  qdisc_put_rtab(cl->R_tab);
1799  cl->R_tab = rtab;
1800  }
1801 
1802  if (tb[TCA_CBQ_LSSOPT])
1803  cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1804 
1805  if (tb[TCA_CBQ_WRROPT]) {
1806  cbq_rmprio(q, cl);
1807  cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1808  }
1809 
1810  if (tb[TCA_CBQ_OVL_STRATEGY])
1811  cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1812 
1813 #ifdef CONFIG_NET_CLS_ACT
1814  if (tb[TCA_CBQ_POLICE])
1815  cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1816 #endif
1817 
1818  if (tb[TCA_CBQ_FOPT])
1819  cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1820 
1821  if (cl->q->q.qlen)
1822  cbq_activate_class(cl);
1823 
1824  sch_tree_unlock(sch);
1825 
1826  return 0;
1827  }
1828 
1829  if (parentid == TC_H_ROOT)
1830  return -EINVAL;
1831 
1832  if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1833  tb[TCA_CBQ_LSSOPT] == NULL)
1834  return -EINVAL;
1835 
1836  rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1837  if (rtab == NULL)
1838  return -EINVAL;
1839 
1840  if (classid) {
1841  err = -EINVAL;
1842  if (TC_H_MAJ(classid ^ sch->handle) ||
1843  cbq_class_lookup(q, classid))
1844  goto failure;
1845  } else {
1846  int i;
1847  classid = TC_H_MAKE(sch->handle, 0x8000);
1848 
1849  for (i = 0; i < 0x8000; i++) {
1850  if (++q->hgenerator >= 0x8000)
1851  q->hgenerator = 1;
1852  if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1853  break;
1854  }
1855  err = -ENOSR;
1856  if (i >= 0x8000)
1857  goto failure;
1858  classid = classid|q->hgenerator;
1859  }
1860 
1861  parent = &q->link;
1862  if (parentid) {
1863  parent = cbq_class_lookup(q, parentid);
1864  err = -EINVAL;
1865  if (parent == NULL)
1866  goto failure;
1867  }
1868 
1869  err = -ENOBUFS;
1870  cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1871  if (cl == NULL)
1872  goto failure;
1873 
1874  if (tca[TCA_RATE]) {
1875  err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1876  qdisc_root_sleeping_lock(sch),
1877  tca[TCA_RATE]);
1878  if (err) {
1879  kfree(cl);
1880  goto failure;
1881  }
1882  }
1883 
1884  cl->R_tab = rtab;
1885  rtab = NULL;
1886  cl->refcnt = 1;
1887  cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1888  if (!cl->q)
1889  cl->q = &noop_qdisc;
1890  cl->common.classid = classid;
1891  cl->tparent = parent;
1892  cl->qdisc = sch;
1893  cl->allot = parent->allot;
1894  cl->quantum = cl->allot;
1895  cl->weight = cl->R_tab->rate.rate;
1896 
1897  sch_tree_lock(sch);
1898  cbq_link_class(cl);
1899  cl->borrow = cl->tparent;
1900  if (cl->tparent != &q->link)
1901  cl->share = cl->tparent;
1902  cbq_adjust_levels(parent);
1903  cl->minidle = -0x7FFFFFFF;
1904  cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1905  cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1906  if (cl->ewma_log == 0)
1907  cl->ewma_log = q->link.ewma_log;
1908  if (cl->maxidle == 0)
1909  cl->maxidle = q->link.maxidle;
1910  if (cl->avpkt == 0)
1911  cl->avpkt = q->link.avpkt;
1912  cl->overlimit = cbq_ovl_classic;
1913  if (tb[TCA_CBQ_OVL_STRATEGY])
1914  cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1915 #ifdef CONFIG_NET_CLS_ACT
1916  if (tb[TCA_CBQ_POLICE])
1917  cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1918 #endif
1919  if (tb[TCA_CBQ_FOPT])
1920  cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1921  sch_tree_unlock(sch);
1922 
1923  qdisc_class_hash_grow(sch, &q->clhash);
1924 
1925  *arg = (unsigned long)cl;
1926  return 0;
1927 
1928 failure:
1929  qdisc_put_rtab(rtab);
1930  return err;
1931 }
1932 
1933 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1934 {
1935  struct cbq_sched_data *q = qdisc_priv(sch);
1936  struct cbq_class *cl = (struct cbq_class *)arg;
1937  unsigned int qlen;
1938 
1939  if (cl->filters || cl->children || cl == &q->link)
1940  return -EBUSY;
1941 
1942  sch_tree_lock(sch);
1943 
1944  qlen = cl->q->q.qlen;
1945  qdisc_reset(cl->q);
1946  qdisc_tree_decrease_qlen(cl->q, qlen);
1947 
1948  if (cl->next_alive)
1949  cbq_deactivate_class(cl);
1950 
1951  if (q->tx_borrowed == cl)
1952  q->tx_borrowed = q->tx_class;
1953  if (q->tx_class == cl) {
1954  q->tx_class = NULL;
1955  q->tx_borrowed = NULL;
1956  }
1957 #ifdef CONFIG_NET_CLS_ACT
1958  if (q->rx_class == cl)
1959  q->rx_class = NULL;
1960 #endif
1961 
1962  cbq_unlink_class(cl);
1963  cbq_adjust_levels(cl->tparent);
1964  cl->defmap = 0;
1965  cbq_sync_defmap(cl);
1966 
1967  cbq_rmprio(q, cl);
1968  sch_tree_unlock(sch);
1969 
1970  BUG_ON(--cl->refcnt == 0);
1971  /*
1972  * This shouldn't happen: we "hold" one cops->get() when called
1973  * from tc_ctl_tclass; the destroy method is done from cops->put().
1974  */
1975 
1976  return 0;
1977 }
1978 
1979 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1980 {
1981  struct cbq_sched_data *q = qdisc_priv(sch);
1982  struct cbq_class *cl = (struct cbq_class *)arg;
1983 
1984  if (cl == NULL)
1985  cl = &q->link;
1986 
1987  return &cl->filter_list;
1988 }
1989 
1990 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1991  u32 classid)
1992 {
1993  struct cbq_sched_data *q = qdisc_priv(sch);
1994  struct cbq_class *p = (struct cbq_class *)parent;
1995  struct cbq_class *cl = cbq_class_lookup(q, classid);
1996 
1997  if (cl) {
1998  if (p && p->level <= cl->level)
1999  return 0;
2000  cl->filters++;
2001  return (unsigned long)cl;
2002  }
2003  return 0;
2004 }
2005 
2006 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2007 {
2008  struct cbq_class *cl = (struct cbq_class *)arg;
2009 
2010  cl->filters--;
2011 }
2012 
2013 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2014 {
2015  struct cbq_sched_data *q = qdisc_priv(sch);
2016  struct cbq_class *cl;
2017  struct hlist_node *n;
2018  unsigned int h;
2019 
2020  if (arg->stop)
2021  return;
2022 
2023  for (h = 0; h < q->clhash.hashsize; h++) {
2024  hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2025  if (arg->count < arg->skip) {
2026  arg->count++;
2027  continue;
2028  }
2029  if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2030  arg->stop = 1;
2031  return;
2032  }
2033  arg->count++;
2034  }
2035  }
2036 }
2037 
2038 static const struct Qdisc_class_ops cbq_class_ops = {
2039  .graft = cbq_graft,
2040  .leaf = cbq_leaf,
2041  .qlen_notify = cbq_qlen_notify,
2042  .get = cbq_get,
2043  .put = cbq_put,
2044  .change = cbq_change_class,
2045  .delete = cbq_delete,
2046  .walk = cbq_walk,
2047  .tcf_chain = cbq_find_tcf,
2048  .bind_tcf = cbq_bind_filter,
2049  .unbind_tcf = cbq_unbind_filter,
2050  .dump = cbq_dump_class,
2051  .dump_stats = cbq_dump_class_stats,
2052 };
2053 
2054 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2055  .next = NULL,
2056  .cl_ops = &cbq_class_ops,
2057  .id = "cbq",
2058  .priv_size = sizeof(struct cbq_sched_data),
2059  .enqueue = cbq_enqueue,
2060  .dequeue = cbq_dequeue,
2061  .peek = qdisc_peek_dequeued,
2062  .drop = cbq_drop,
2063  .init = cbq_init,
2064  .reset = cbq_reset,
2065  .destroy = cbq_destroy,
2066  .change = NULL,
2067  .dump = cbq_dump,
2068  .dump_stats = cbq_dump_stats,
2069  .owner = THIS_MODULE,
2070 };
2071 
2072 static int __init cbq_module_init(void)
2073 {
2074  return register_qdisc(&cbq_qdisc_ops);
2075 }
2076 static void __exit cbq_module_exit(void)
2077 {
2078  unregister_qdisc(&cbq_qdisc_ops);
2079 }
2080 module_init(cbq_module_init)
2081 module_exit(cbq_module_exit)
2082 MODULE_LICENSE("GPL");