Linux Kernel 3.7.1
sch_tbf.c
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <[email protected]>
 *		Dmitry Torokhov <[email protected]> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

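	Worked example (added for illustration, with made-up numbers):
	let R = 1 Mbit/s and B = 10 kbit.  Three back-to-back 4 kbit
	packets arriving at t = 0 total 12 kbit, which exceeds B, so the
	third packet may not be sent until R*t has supplied the missing
	2 kbit, i.e. roughly 2 ms later.
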
	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

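	Worked example (added; numbers are made up): with B/R = 10 ms of
	credit and an idle link, N sits at 10 ms.  Packets that each take
	S/R = 3 ms to serialize pass immediately while N drops
	10 -> 7 -> 4 -> 1 ms; the fourth such packet must wait 2 ms until
	N has grown back to 3 ms, and its transmission then leaves N at 0.
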
	Actually, QoS requires two TBF to be applied to a data stream.
	One of them controls steady state burst size, another
	one with rate P (peak rate) and depth M (equal to link MTU)
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

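	Worked example (added; here L is taken to be the configured backlog
	limit in bytes): with limit L = 10 kB, bucket B = 2 kB, rate
	R = 1 Mbyte/s and no peak rate, a full queue drains in
	(10k - 2k)/1M = 8 ms, which is the worst-case latency the formula
	predicts.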

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10 Mbit ethernet and HZ=100 the minimal allowed B is ~10 kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500
	P_crit = 150 Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

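	(Added note: P_crit above is just MTU*HZ; with HZ=1000 the same
	1500-byte MTU allows a peak rate of about 1.5 Mbytes/sec.)
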
	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/

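/* The sketch below is NOT part of the kernel file.  It is a minimal
 * user-space rendering of the algorithm described above, added for
 * illustration only: one bucket with rate R and depth B, doing the same
 * "grow credit, clamp to the depth, spend on transmit" bookkeeping that
 * tbf_dequeue() performs in psched ticks.  The names (toy_bucket,
 * toy_admit) and the byte-based accounting are made up for this example.
 */
#if 0	/* illustration only, never compiled */
#include <stdbool.h>

struct toy_bucket {
	double rate;	/* bytes per second (R) */
	double depth;	/* bucket depth in bytes (B) */
	double tokens;	/* currently available credit, in bytes */
	double t_c;	/* time of the last update, in seconds */
};

/* May a packet of 'len' bytes be sent at time 'now'?  On success the
 * credit is spent, mirroring the "N -= S/R" step in the comment above
 * (except that this toy keeps the credit in bytes, not seconds). */
static bool toy_admit(struct toy_bucket *b, double now, unsigned int len)
{
	double toks = b->tokens + (now - b->t_c) * b->rate;

	if (toks > b->depth)
		toks = b->depth;	/* clamp, as in min{B/R, ...} */
	if (toks < (double)len)
		return false;		/* throttle until more credit accrues */

	b->tokens = toks - len;
	b->t_c = now;
	return true;
}
#endif
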
struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

/* Length-to-time: look up, in the rate (R) or peak-rate (P) table, how
 * long a packet of L bytes takes to transmit, in psched ticks. */
#define L2T(q, L)	qdisc_l2t((q)->R_tab, L)
#define L2T_P(q, L)	qdisc_l2t((q)->P_tab, L)

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	/* A packet bigger than max_size could never accumulate enough
	 * tokens to be sent, so reject it up front. */
	if (qdisc_pkt_len(skb) > q->max_size)
		return qdisc_reshape_fail(skb, sch);

	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = psched_get_time();
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

		/* Both token counters non-negative: the packet may go out. */
		if ((toks | ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		/* Not enough tokens: sleep until the larger of the two
		 * deficits has been refilled. */
		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_PTAB + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
		if (ptab == NULL)
			goto done;
	}

	/* Find the largest packet size whose transmission time still fits
	 * into the configured buffer (and, with a peak rate, into mtu);
	 * anything larger is rejected in tbf_enqueue(). */
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer)
			break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu)
				break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size)
			max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	swap(q->R_tab, rtab);
	swap(q->P_tab, ptab);

	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");
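
/* Usage note (added; not part of the kernel source): once this module is
 * registered, the qdisc is normally attached with iproute2's tc.  One
 * common invocation, with illustrative numbers, is:
 *
 *   tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * "burst" supplies the bucket depth B discussed in the header comment,
 * and tc turns "latency" into the backlog limit that ends up in
 * TCA_TBF_PARMS. */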