sch_sfq.c (Linux kernel 3.7.1)
1 /*
2  * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  *
9  * Authors: Alexey Kuznetsov, <[email protected]>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/flow_keys.h>
27 #include <net/red.h>
28 
29 
30 /* Stochastic Fairness Queuing algorithm.
31  =======================================
32 
33  Source:
34  Paul E. McKenney "Stochastic Fairness Queuing",
35  IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
36 
37  Paul E. McKenney "Stochastic Fairness Queuing",
38  "Interworking: Research and Experience", v.2, 1991, p.113-131.
39 
40 
41  See also:
42  M. Shreedhar and George Varghese "Efficient Fair
43  Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46  This is not the thing that is usually called (W)FQ nowadays.
47  It does not use any timestamp mechanism, but instead
48  processes queues in round-robin order.
49 
50  ADVANTAGE:
51 
52  - It is very cheap. Both CPU and memory requirements are minimal.
53 
54  DRAWBACKS:
55 
56  - "Stochastic" -> It is not 100% fair.
57  When hash collisions occur, several flows are considered as one.
58 
59  - "Round-robin" -> It introduces larger delays than virtual clock
60  based schemes, and should not be used for isolating interactive
61  traffic from non-interactive. This means that this scheduler
62  should be used as a leaf of CBQ or P3, which put interactive traffic
63  into a higher priority band.
64 
65  We still need true WFQ for top level CSZ, but using WFQ
66  for the best effort traffic is absolutely pointless:
67  SFQ is superior for this purpose.
68 
69  IMPLEMENTATION:
70  This implementation limits:
71  - maximal queue length per flow to 127 packets,
72  - max MTU to 2^18-1,
73  - max 65408 flows,
74  - number of hash buckets to 65536.
75 
76  It is easy to increase these values, but not in flight. */
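
A minimal userspace sketch of the scheme described above (illustrative only, not kernel
code; every name and constant is invented for the example): packets are hashed into a
handful of buckets and the buckets are served round-robin, one quantum of bytes per
visit, so a heavy flow cannot starve light ones.

#include <stdio.h>

#define NBUCKETS 8
#define QUANTUM  1514                    /* roughly one MTU of credit per visit */

struct bucket { int backlog; };          /* bytes queued for this flow group */

static unsigned int toy_hash(unsigned int flow_id, unsigned int perturb)
{
	/* any decent hash works; perturb reshuffles collisions over time */
	return ((flow_id * 2654435761u) ^ perturb) % NBUCKETS;
}

int main(void)
{
	struct bucket b[NBUCKETS] = { { 0 } };
	unsigned int perturb = 12345;
	int i, round;

	/* enqueue: one heavy flow and two light ones */
	for (i = 0; i < 30; i++)
		b[toy_hash(1, perturb)].backlog += 1000;
	b[toy_hash(2, perturb)].backlog += 1000;
	b[toy_hash(3, perturb)].backlog += 1000;

	/* dequeue: visit buckets round-robin, at most one quantum each per round */
	for (round = 0; round < 3; round++)
		for (i = 0; i < NBUCKETS; i++)
			if (b[i].backlog > 0) {
				int sent = b[i].backlog < QUANTUM ? b[i].backlog : QUANTUM;

				b[i].backlog -= sent;
				printf("round %d: bucket %d sent %d bytes\n", round, i, sent);
			}
	return 0;
}

The real qdisc adds what the sketch leaves out: a per-flow packet limit, optional RED
marking, and a perturbation timer that periodically re-seeds the hash.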
77 
78 #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS 128
80 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT 0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K.
85  * Scale allot by 8 (1<<3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT 3
88 #define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
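
A quick worked example of the scaling above, using the values defined in this file
(illustrative arithmetic only):

	SFQ_ALLOT_SIZE(1514)  == DIV_ROUND_UP(1514, 8)  == 190
	SFQ_ALLOT_SIZE(65535) == DIV_ROUND_UP(65535, 8) == 8192

Even a maximal 64K packet therefore costs only 8192 allot units, which fits comfortably
in the 16-bit allot field of struct sfq_slot.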
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We don't use pointers, in order to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' to dep[] array
98  */
99 struct sfq_head {
100  sfq_index next;
101  sfq_index prev;
102 };
103 
104 struct sfq_slot {
105  struct sk_buff *skblist_next;
106  struct sk_buff *skblist_prev;
107  sfq_index qlen; /* number of skbs in skblist */
108  sfq_index next; /* next slot in sfq RR chain */
109  struct sfq_head dep; /* anchor in dep[] chains */
110  unsigned short hash; /* hash value (index in ht[]) */
111  short allot; /* credit for this slot */
112 
113  unsigned int backlog;
114  struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119  int limit; /* limit of total number of packets in this qdisc */
120  unsigned int divisor; /* number of slots in hash table */
121  u8 headdrop;
122  u8 maxdepth; /* limit of packets per flow */
123 
124  u32 perturbation;
125  u8 cur_depth; /* depth of longest slot */
126  u8 flags;
127  unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128  struct tcf_proto *filter_list;
129  sfq_index *ht; /* Hash table ('divisor' slots) */
130  struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
131 
132  struct red_parms *red_parms;
133  struct tc_sfq_red_stats stats;
134  struct sfq_slot *tail; /* current slot in round */
135 
136  struct sfq_head dep[SFQ_MAX_DEPTH + 1];
137  /* Linked lists of slots, indexed by depth
138  * dep[0] : list of unused flows
139  * dep[1] : list of flows with 1 packet
140  * dep[X] : list of flows with X packets
141  */
142 
143  unsigned int maxflows; /* number of flows in flows array */
144  int perturb_period;
145  unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
146  struct timer_list perturb_timer;
147 };
148 
149 /*
150  * sfq_head are either in a sfq_slot or in dep[] array
151  */
152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
153 {
154  if (val < SFQ_MAX_FLOWS)
155  return &q->slots[val].dep;
156  return &q->dep[val - SFQ_MAX_FLOWS];
157 }
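
To make the index trick concrete (a worked example, not new code), with the constants
defined above:

	slot indexes : 0 .. SFQ_MAX_FLOWS - 1        = 0 .. 65407
	dep indexes  : SFQ_MAX_FLOWS .. SFQ_MAX_FLOWS + SFQ_MAX_DEPTH
	             = 65408 .. 65535                (128 list heads)
	total        : 65408 + 128 = 65536           (exactly the range of a u16)

So sfq_dep_head() needs only a single comparison against SFQ_MAX_FLOWS to tell the two
kinds of 'pointer' apart, and sfq_index can stay a bare u16. SFQ_EMPTY_SLOT (0xffff) is
only ever stored in ht[], whose valid entries are slot indexes below SFQ_MAX_FLOWS, so
it can never collide with a real slot index.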
158 
159 /*
160  * In order to be able to quickly rehash our queue when the timer changes
161  * q->perturbation, we store flow_keys in skb->cb[]
162  */
163 struct sfq_skb_cb {
164  struct flow_keys keys;
165 };
166 
167 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
168 {
169  qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
170  return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
171 }
172 
173 static unsigned int sfq_hash(const struct sfq_sched_data *q,
174  const struct sk_buff *skb)
175 {
176  const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
177  unsigned int hash;
178 
179  hash = jhash_3words((__force u32)keys->dst,
180  (__force u32)keys->src ^ keys->ip_proto,
181  (__force u32)keys->ports, q->perturbation);
182  return hash & (q->divisor - 1);
183 }
184 
185 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
186  int *qerr)
187 {
188  struct sfq_sched_data *q = qdisc_priv(sch);
189  struct tcf_result res;
190  int result;
191 
192  if (TC_H_MAJ(skb->priority) == sch->handle &&
193  TC_H_MIN(skb->priority) > 0 &&
194  TC_H_MIN(skb->priority) <= q->divisor)
195  return TC_H_MIN(skb->priority);
196 
197  if (!q->filter_list) {
198  skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
199  return sfq_hash(q, skb) + 1;
200  }
201 
202  *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
203  result = tc_classify(skb, q->filter_list, &res);
204  if (result >= 0) {
205 #ifdef CONFIG_NET_CLS_ACT
206  switch (result) {
207  case TC_ACT_STOLEN:
208  case TC_ACT_QUEUED:
209  *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
210  case TC_ACT_SHOT:
211  return 0;
212  }
213 #endif
214  if (TC_H_MIN(res.classid) <= q->divisor)
215  return TC_H_MIN(res.classid);
216  }
217  return 0;
218 }
219 
220 /*
221  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
222  */
223 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
224 {
225  sfq_index p, n;
226  struct sfq_slot *slot = &q->slots[x];
227  int qlen = slot->qlen;
228 
229  p = qlen + SFQ_MAX_FLOWS;
230  n = q->dep[qlen].next;
231 
232  slot->dep.next = n;
233  slot->dep.prev = p;
234 
235  q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
236  sfq_dep_head(q, n)->prev = x;
237 }
238 
239 #define sfq_unlink(q, x, n, p) \
240  n = q->slots[x].dep.next; \
241  p = q->slots[x].dep.prev; \
242  sfq_dep_head(q, p)->next = n; \
243  sfq_dep_head(q, n)->prev = p
244 
245 
246 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
247 {
248  sfq_index p, n;
249  int d;
250 
251  sfq_unlink(q, x, n, p);
252 
253  d = q->slots[x].qlen--;
254  if (n == p && q->cur_depth == d)
255  q->cur_depth--;
256  sfq_link(q, x);
257 }
258 
259 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
260 {
261  sfq_index p, n;
262  int d;
263 
264  sfq_unlink(q, x, n, p);
265 
266  d = ++q->slots[x].qlen;
267  if (q->cur_depth < d)
268  q->cur_depth = d;
269  sfq_link(q, x);
270 }
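
A short worked trace of the dep[] bookkeeping maintained by sfq_link(), sfq_inc() and
sfq_dec() above (slot letters invented for the example):

	slot A holds 3 packets  ->  A sits on the dep[3] list
	slot B holds 1 packet   ->  B sits on the dep[1] list
	q->cur_depth == 3       ->  sfq_drop() reaches the longest backlog directly
	                            via q->dep[3].next, with no scan of the flows table
	A loses one packet and was the only slot at depth 3
	                        ->  sfq_dec() moves A onto dep[2] and lowers
	                            cur_depth to 2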
271 
272 /* helper functions: might be changed when/if skb uses a standard list_head */
273 
274 /* remove one skb from tail of slot queue */
275 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
276 {
277  struct sk_buff *skb = slot->skblist_prev;
278 
279  slot->skblist_prev = skb->prev;
280  skb->prev->next = (struct sk_buff *)slot;
281  skb->next = skb->prev = NULL;
282  return skb;
283 }
284 
285 /* remove one skb from head of slot queue */
286 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
287 {
288  struct sk_buff *skb = slot->skblist_next;
289 
290  slot->skblist_next = skb->next;
291  skb->next->prev = (struct sk_buff *)slot;
292  skb->next = skb->prev = NULL;
293  return skb;
294 }
295 
296 static inline void slot_queue_init(struct sfq_slot *slot)
297 {
298  memset(slot, 0, sizeof(*slot));
299  slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
300 }
301 
302 /* add skb to slot queue (tail add) */
303 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
304 {
305  skb->prev = slot->skblist_prev;
306  skb->next = (struct sk_buff *)slot;
307  slot->skblist_prev->next = skb;
308  slot->skblist_prev = skb;
309 }
310 
311 #define slot_queue_walk(slot, skb) \
312  for (skb = slot->skblist_next; \
313  skb != (struct sk_buff *)slot; \
314  skb = skb->next)
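
The helpers above use the slot itself as the sentinel of its own circular skb list by
casting struct sfq_slot * to struct sk_buff *; this works because skblist_next and
skblist_prev are laid out to mirror sk_buff's leading next/prev pointers. A
self-contained userspace toy of the same trick (all names invented for the example):

#include <stdio.h>

struct node {
	struct node *next, *prev;
	int val;
};

/* the first two fields deliberately mirror struct node's next/prev */
struct toy_slot {
	struct node *list_next;
	struct node *list_prev;
	int qlen;
};

static void toy_queue_init(struct toy_slot *s)
{
	s->list_next = s->list_prev = (struct node *)s;
	s->qlen = 0;
}

static void toy_queue_add(struct toy_slot *s, struct node *n)
{
	n->prev = s->list_prev;
	n->next = (struct node *)s;
	s->list_prev->next = n;      /* on an empty list this writes s->list_next */
	s->list_prev = n;
	s->qlen++;
}

int main(void)
{
	struct toy_slot s;
	struct node a = { .val = 1 }, b = { .val = 2 };
	struct node *it;

	toy_queue_init(&s);
	toy_queue_add(&s, &a);
	toy_queue_add(&s, &b);

	for (it = s.list_next; it != (struct node *)&s; it = it->next)
		printf("val %d\n", it->val);
	return 0;
}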
315 
316 static unsigned int sfq_drop(struct Qdisc *sch)
317 {
318  struct sfq_sched_data *q = qdisc_priv(sch);
319  sfq_index x, d = q->cur_depth;
320  struct sk_buff *skb;
321  unsigned int len;
322  struct sfq_slot *slot;
323 
324  /* Queue is full! Find the longest slot and drop the tail packet from it */
325  if (d > 1) {
326  x = q->dep[d].next;
327  slot = &q->slots[x];
328 drop:
329  skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
330  len = qdisc_pkt_len(skb);
331  slot->backlog -= len;
332  sfq_dec(q, x);
333  kfree_skb(skb);
334  sch->q.qlen--;
335  sch->qstats.drops++;
336  sch->qstats.backlog -= len;
337  return len;
338  }
339 
340  if (d == 1) {
341  /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
342  x = q->tail->next;
343  slot = &q->slots[x];
344  q->tail->next = slot->next;
345  q->ht[slot->hash] = SFQ_EMPTY_SLOT;
346  goto drop;
347  }
348 
349  return 0;
350 }
351 
352 /* Is ECN parameter configured */
353 static int sfq_prob_mark(const struct sfq_sched_data *q)
354 {
355  return q->flags & TC_RED_ECN;
356 }
357 
358 /* Should packets over max threshold just be marked */
359 static int sfq_hard_mark(const struct sfq_sched_data *q)
360 {
361  return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
362 }
363 
364 static int sfq_headdrop(const struct sfq_sched_data *q)
365 {
366  return q->headdrop;
367 }
368 
369 static int
370 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
371 {
372  struct sfq_sched_data *q = qdisc_priv(sch);
373  unsigned int hash;
374  sfq_index x, qlen;
375  struct sfq_slot *slot;
376  int uninitialized_var(ret);
377  struct sk_buff *head;
378  int delta;
379 
380  hash = sfq_classify(skb, sch, &ret);
381  if (hash == 0) {
382  if (ret & __NET_XMIT_BYPASS)
383  sch->qstats.drops++;
384  kfree_skb(skb);
385  return ret;
386  }
387  hash--;
388 
389  x = q->ht[hash];
390  slot = &q->slots[x];
391  if (x == SFQ_EMPTY_SLOT) {
392  x = q->dep[0].next; /* get a free slot */
393  if (x >= SFQ_MAX_FLOWS)
394  return qdisc_drop(skb, sch);
395  q->ht[hash] = x;
396  slot = &q->slots[x];
397  slot->hash = hash;
398  slot->backlog = 0; /* should already be 0 anyway... */
399  red_set_vars(&slot->vars);
400  goto enqueue;
401  }
402  if (q->red_parms) {
403  slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
404  &slot->vars,
405  slot->backlog);
406  switch (red_action(q->red_parms,
407  &slot->vars,
408  slot->vars.qavg)) {
409  case RED_DONT_MARK:
410  break;
411 
412  case RED_PROB_MARK:
413  sch->qstats.overlimits++;
414  if (sfq_prob_mark(q)) {
415  /* We know we have at least one packet in queue */
416  if (sfq_headdrop(q) &&
417  INET_ECN_set_ce(slot->skblist_next)) {
418  q->stats.prob_mark_head++;
419  break;
420  }
421  if (INET_ECN_set_ce(skb)) {
422  q->stats.prob_mark++;
423  break;
424  }
425  }
426  q->stats.prob_drop++;
427  goto congestion_drop;
428 
429  case RED_HARD_MARK:
430  sch->qstats.overlimits++;
431  if (sfq_hard_mark(q)) {
432  /* We know we have at least one packet in queue */
433  if (sfq_headdrop(q) &&
434  INET_ECN_set_ce(slot->skblist_next)) {
435  q->stats.forced_mark_head++;
436  break;
437  }
438  if (INET_ECN_set_ce(skb)) {
439  q->stats.forced_mark++;
440  break;
441  }
442  }
443  q->stats.forced_drop++;
444  goto congestion_drop;
445  }
446  }
447 
448  if (slot->qlen >= q->maxdepth) {
449 congestion_drop:
450  if (!sfq_headdrop(q))
451  return qdisc_drop(skb, sch);
452 
453  /* We know we have at least one packet in queue */
454  head = slot_dequeue_head(slot);
455  delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
456  sch->qstats.backlog -= delta;
457  slot->backlog -= delta;
458  qdisc_drop(head, sch);
459 
460  slot_queue_add(slot, skb);
461  return NET_XMIT_CN;
462  }
463 
464 enqueue:
465  sch->qstats.backlog += qdisc_pkt_len(skb);
466  slot->backlog += qdisc_pkt_len(skb);
467  slot_queue_add(slot, skb);
468  sfq_inc(q, x);
469  if (slot->qlen == 1) { /* The flow is new */
470  if (q->tail == NULL) { /* It is the first flow */
471  slot->next = x;
472  } else {
473  slot->next = q->tail->next;
474  q->tail->next = x;
475  }
476  /* We put this flow at the end of our flow list.
477  * This might sound unfair for a new flow to wait after old ones,
478  * but we could end up servicing new flows only, and freeze old ones.
479  */
480  q->tail = slot;
481  /* We could use a bigger initial quantum for new flows */
482  slot->allot = q->scaled_quantum;
483  }
484  if (++sch->q.qlen <= q->limit)
485  return NET_XMIT_SUCCESS;
486 
487  qlen = slot->qlen;
488  sfq_drop(sch);
489  /* Return Congestion Notification only if we dropped a packet
490  * from this flow.
491  */
492  if (qlen != slot->qlen)
493  return NET_XMIT_CN;
494 
495  /* As we dropped a packet, better let upper stack know this */
496  qdisc_tree_decrease_qlen(sch, 1);
497  return NET_XMIT_SUCCESS;
498 }
499 
500 static struct sk_buff *
501 sfq_dequeue(struct Qdisc *sch)
502 {
503  struct sfq_sched_data *q = qdisc_priv(sch);
504  struct sk_buff *skb;
505  sfq_index a, next_a;
506  struct sfq_slot *slot;
507 
508  /* No active slots */
509  if (q->tail == NULL)
510  return NULL;
511 
512 next_slot:
513  a = q->tail->next;
514  slot = &q->slots[a];
515  if (slot->allot <= 0) {
516  q->tail = slot;
517  slot->allot += q->scaled_quantum;
518  goto next_slot;
519  }
520  skb = slot_dequeue_head(slot);
521  sfq_dec(q, a);
522  qdisc_bstats_update(sch, skb);
523  sch->q.qlen--;
524  sch->qstats.backlog -= qdisc_pkt_len(skb);
525  slot->backlog -= qdisc_pkt_len(skb);
526  /* Is the slot empty? */
527  if (slot->qlen == 0) {
528  q->ht[slot->hash] = SFQ_EMPTY_SLOT;
529  next_a = slot->next;
530  if (a == next_a) {
531  q->tail = NULL; /* no more active slots */
532  return skb;
533  }
534  q->tail->next = next_a;
535  } else {
536  slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
537  }
538  return skb;
539 }
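
Together with the initial slot->allot = q->scaled_quantum in sfq_enqueue(), the allot
handling above is a deficit round robin: a slot pays SFQ_ALLOT_SIZE(pkt_len) per
dequeued packet and, once its credit is gone, is skipped and recharged on the next
visit. A simplified single-slot sketch of that bookkeeping (userspace, illustrative
constants; unlike the kernel it recharges in place instead of moving on to the next
slot):

#include <stdio.h>

#define SHIFT    3
#define SCALE(x) (((x) + (1 << SHIFT) - 1) >> SHIFT)   /* DIV_ROUND_UP(x, 8) */

int main(void)
{
	int quantum = 1514;                 /* bytes of credit granted per round */
	short allot = SCALE(quantum);       /* 190 units, as set by sfq_enqueue() */
	int pkts[] = { 1514, 1514, 60 };    /* packet sizes to send, in bytes */
	int i;

	for (i = 0; i < 3; i++) {
		if (allot <= 0) {           /* out of credit: recharge */
			allot += SCALE(quantum);
			printf("recharged to %d units\n", allot);
		}
		allot -= SCALE(pkts[i]);
		printf("sent %4d bytes, allot now %d\n", pkts[i], allot);
	}
	return 0;
}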
540 
541 static void
542 sfq_reset(struct Qdisc *sch)
543 {
544  struct sk_buff *skb;
545 
546  while ((skb = sfq_dequeue(sch)) != NULL)
547  kfree_skb(skb);
548 }
549 
550 /*
551  * When q->perturbation is changed, we rehash all queued skbs
552  * to avoid OOO (Out Of Order) effects.
553  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
554  * counters.
555  */
556 static void sfq_rehash(struct Qdisc *sch)
557 {
558  struct sfq_sched_data *q = qdisc_priv(sch);
559  struct sk_buff *skb;
560  int i;
561  struct sfq_slot *slot;
562  struct sk_buff_head list;
563  int dropped = 0;
564 
565  __skb_queue_head_init(&list);
566 
567  for (i = 0; i < q->maxflows; i++) {
568  slot = &q->slots[i];
569  if (!slot->qlen)
570  continue;
571  while (slot->qlen) {
572  skb = slot_dequeue_head(slot);
573  sfq_dec(q, i);
574  __skb_queue_tail(&list, skb);
575  }
576  slot->backlog = 0;
577  red_set_vars(&slot->vars);
578  q->ht[slot->hash] = SFQ_EMPTY_SLOT;
579  }
580  q->tail = NULL;
581 
582  while ((skb = __skb_dequeue(&list)) != NULL) {
583  unsigned int hash = sfq_hash(q, skb);
584  sfq_index x = q->ht[hash];
585 
586  slot = &q->slots[x];
587  if (x == SFQ_EMPTY_SLOT) {
588  x = q->dep[0].next; /* get a free slot */
589  if (x >= SFQ_MAX_FLOWS) {
590 drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
591  kfree_skb(skb);
592  dropped++;
593  continue;
594  }
595  q->ht[hash] = x;
596  slot = &q->slots[x];
597  slot->hash = hash;
598  }
599  if (slot->qlen >= q->maxdepth)
600  goto drop;
601  slot_queue_add(slot, skb);
602  if (q->red_parms)
603  slot->vars.qavg = red_calc_qavg(q->red_parms,
604  &slot->vars,
605  slot->backlog);
606  slot->backlog += qdisc_pkt_len(skb);
607  sfq_inc(q, x);
608  if (slot->qlen == 1) { /* The flow is new */
609  if (q->tail == NULL) { /* It is the first flow */
610  slot->next = x;
611  } else {
612  slot->next = q->tail->next;
613  q->tail->next = x;
614  }
615  q->tail = slot;
616  slot->allot = q->scaled_quantum;
617  }
618  }
619  sch->q.qlen -= dropped;
620  qdisc_tree_decrease_qlen(sch, dropped);
621 }
622 
623 static void sfq_perturbation(unsigned long arg)
624 {
625  struct Qdisc *sch = (struct Qdisc *)arg;
626  struct sfq_sched_data *q = qdisc_priv(sch);
627  spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
628 
629  spin_lock(root_lock);
630  q->perturbation = net_random();
631  if (!q->filter_list && q->tail)
632  sfq_rehash(sch);
633  spin_unlock(root_lock);
634 
635  if (q->perturb_period)
636  mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
637 }
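
A worked walk-through of how a typical configuration reaches this handler (illustrative;
"perturb 10" is the tc-sfq parameter expressed in seconds):

	tc's "perturb 10"        ->  ctl->perturb_period == 10 in sfq_change()
	sfq_change()             ->  q->perturb_period = 10 * HZ jiffies
	every 10 seconds here    ->  q->perturbation = net_random(), and if no
	                             classifier is attached, sfq_rehash() redistributes
	                             the queued skbs using the flow_keys cached in
	                             skb->cb[], so colliding flows do not stay merged
	                             forever.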
638 
639 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
640 {
641  struct sfq_sched_data *q = qdisc_priv(sch);
642  struct tc_sfq_qopt *ctl = nla_data(opt);
643  struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
644  unsigned int qlen;
645  struct red_parms *p = NULL;
646 
647  if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
648  return -EINVAL;
649  if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
650  ctl_v1 = nla_data(opt);
651  if (ctl->divisor &&
652  (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
653  return -EINVAL;
654  if (ctl_v1 && ctl_v1->qth_min) {
655  p = kmalloc(sizeof(*p), GFP_KERNEL);
656  if (!p)
657  return -ENOMEM;
658  }
659  sch_tree_lock(sch);
660  if (ctl->quantum) {
661  q->quantum = ctl->quantum;
662  q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
663  }
664  q->perturb_period = ctl->perturb_period * HZ;
665  if (ctl->flows)
666  q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
667  if (ctl->divisor) {
668  q->divisor = ctl->divisor;
669  q->maxflows = min_t(u32, q->maxflows, q->divisor);
670  }
671  if (ctl_v1) {
672  if (ctl_v1->depth)
673  q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
674  if (p) {
675  swap(q->red_parms, p);
676  red_set_parms(q->red_parms,
677  ctl_v1->qth_min, ctl_v1->qth_max,
678  ctl_v1->Wlog,
679  ctl_v1->Plog, ctl_v1->Scell_log,
680  NULL,
681  ctl_v1->max_P);
682  }
683  q->flags = ctl_v1->flags;
684  q->headdrop = ctl_v1->headdrop;
685  }
686  if (ctl->limit) {
687  q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
688  q->maxflows = min_t(u32, q->maxflows, q->limit);
689  }
690 
691  qlen = sch->q.qlen;
692  while (sch->q.qlen > q->limit)
693  sfq_drop(sch);
694  qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
695 
696  del_timer(&q->perturb_timer);
697  if (q->perturb_period) {
698  mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
699  q->perturbation = net_random();
700  }
701  sch_tree_unlock(sch);
702  kfree(p);
703  return 0;
704 }
705 
706 static void *sfq_alloc(size_t sz)
707 {
708  void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
709 
710  if (!ptr)
711  ptr = vmalloc(sz);
712  return ptr;
713 }
714 
715 static void sfq_free(void *addr)
716 {
717  if (addr) {
718  if (is_vmalloc_addr(addr))
719  vfree(addr);
720  else
721  kfree(addr);
722  }
723 }
724 
725 static void sfq_destroy(struct Qdisc *sch)
726 {
727  struct sfq_sched_data *q = qdisc_priv(sch);
728 
729  tcf_destroy_chain(&q->filter_list);
730  q->perturb_period = 0;
731  del_timer_sync(&q->perturb_timer);
732  sfq_free(q->ht);
733  sfq_free(q->slots);
734  kfree(q->red_parms);
735 }
736 
737 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
738 {
739  struct sfq_sched_data *q = qdisc_priv(sch);
740  int i;
741 
742  q->perturb_timer.function = sfq_perturbation;
743  q->perturb_timer.data = (unsigned long)sch;
744  init_timer_deferrable(&q->perturb_timer);
745 
746  for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
747  q->dep[i].next = i + SFQ_MAX_FLOWS;
748  q->dep[i].prev = i + SFQ_MAX_FLOWS;
749  }
750 
751  q->limit = SFQ_MAX_DEPTH;
752  q->maxdepth = SFQ_MAX_DEPTH;
753  q->cur_depth = 0;
754  q->tail = NULL;
755  q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
756  q->maxflows = SFQ_DEFAULT_FLOWS;
757  q->quantum = psched_mtu(qdisc_dev(sch));
758  q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
759  q->perturb_period = 0;
760  q->perturbation = net_random();
761 
762  if (opt) {
763  int err = sfq_change(sch, opt);
764  if (err)
765  return err;
766  }
767 
768  q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
769  q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
770  if (!q->ht || !q->slots) {
771  sfq_destroy(sch);
772  return -ENOMEM;
773  }
774  for (i = 0; i < q->divisor; i++)
775  q->ht[i] = SFQ_EMPTY_SLOT;
776 
777  for (i = 0; i < q->maxflows; i++) {
778  slot_queue_init(&q->slots[i]);
779  sfq_link(q, i);
780  }
781  if (q->limit >= 1)
782  sch->flags |= TCQ_F_CAN_BYPASS;
783  else
784  sch->flags &= ~TCQ_F_CAN_BYPASS;
785  return 0;
786 }
787 
788 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
789 {
790  struct sfq_sched_data *q = qdisc_priv(sch);
791  unsigned char *b = skb_tail_pointer(skb);
792  struct tc_sfq_qopt_v1 opt;
793  struct red_parms *p = q->red_parms;
794 
795  memset(&opt, 0, sizeof(opt));
796  opt.v0.quantum = q->quantum;
797  opt.v0.perturb_period = q->perturb_period / HZ;
798  opt.v0.limit = q->limit;
799  opt.v0.divisor = q->divisor;
800  opt.v0.flows = q->maxflows;
801  opt.depth = q->maxdepth;
802  opt.headdrop = q->headdrop;
803 
804  if (p) {
805  opt.qth_min = p->qth_min >> p->Wlog;
806  opt.qth_max = p->qth_max >> p->Wlog;
807  opt.Wlog = p->Wlog;
808  opt.Plog = p->Plog;
809  opt.Scell_log = p->Scell_log;
810  opt.max_P = p->max_P;
811  }
812  memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
813  opt.flags = q->flags;
814 
815  if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
816  goto nla_put_failure;
817 
818  return skb->len;
819 
820 nla_put_failure:
821  nlmsg_trim(skb, b);
822  return -1;
823 }
824 
825 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
826 {
827  return NULL;
828 }
829 
830 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
831 {
832  return 0;
833 }
834 
835 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
836  u32 classid)
837 {
838  /* we cannot bypass queue discipline anymore */
839  sch->flags &= ~TCQ_F_CAN_BYPASS;
840  return 0;
841 }
842 
843 static void sfq_put(struct Qdisc *q, unsigned long cl)
844 {
845 }
846 
847 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
848 {
849  struct sfq_sched_data *q = qdisc_priv(sch);
850 
851  if (cl)
852  return NULL;
853  return &q->filter_list;
854 }
855 
856 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
857  struct sk_buff *skb, struct tcmsg *tcm)
858 {
859  tcm->tcm_handle |= TC_H_MIN(cl);
860  return 0;
861 }
862 
863 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
864  struct gnet_dump *d)
865 {
866  struct sfq_sched_data *q = qdisc_priv(sch);
867  sfq_index idx = q->ht[cl - 1];
868  struct gnet_stats_queue qs = { 0 };
869  struct tc_sfq_xstats xstats = { 0 };
870 
871  if (idx != SFQ_EMPTY_SLOT) {
872  const struct sfq_slot *slot = &q->slots[idx];
873 
874  xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
875  qs.qlen = slot->qlen;
876  qs.backlog = slot->backlog;
877  }
878  if (gnet_stats_copy_queue(d, &qs) < 0)
879  return -1;
880  return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
881 }
882 
883 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
884 {
885  struct sfq_sched_data *q = qdisc_priv(sch);
886  unsigned int i;
887 
888  if (arg->stop)
889  return;
890 
891  for (i = 0; i < q->divisor; i++) {
892  if (q->ht[i] == SFQ_EMPTY_SLOT ||
893  arg->count < arg->skip) {
894  arg->count++;
895  continue;
896  }
897  if (arg->fn(sch, i + 1, arg) < 0) {
898  arg->stop = 1;
899  break;
900  }
901  arg->count++;
902  }
903 }
904 
905 static const struct Qdisc_class_ops sfq_class_ops = {
906  .leaf = sfq_leaf,
907  .get = sfq_get,
908  .put = sfq_put,
909  .tcf_chain = sfq_find_tcf,
910  .bind_tcf = sfq_bind,
911  .unbind_tcf = sfq_put,
912  .dump = sfq_dump_class,
913  .dump_stats = sfq_dump_class_stats,
914  .walk = sfq_walk,
915 };
916 
917 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
918  .cl_ops = &sfq_class_ops,
919  .id = "sfq",
920  .priv_size = sizeof(struct sfq_sched_data),
921  .enqueue = sfq_enqueue,
922  .dequeue = sfq_dequeue,
923  .peek = qdisc_peek_dequeued,
924  .drop = sfq_drop,
925  .init = sfq_init,
926  .reset = sfq_reset,
927  .destroy = sfq_destroy,
928  .change = NULL,
929  .dump = sfq_dump,
930  .owner = THIS_MODULE,
931 };
932 
933 static int __init sfq_module_init(void)
934 {
935  return register_qdisc(&sfq_qdisc_ops);
936 }
937 static void __exit sfq_module_exit(void)
938 {
939  unregister_qdisc(&sfq_qdisc_ops);
940 }
941 module_init(sfq_module_init)
942 module_exit(sfq_module_exit)
943 MODULE_LICENSE("GPL");