1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. |
4 | * |
5 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> |
6 | */ |
7 | |
8 | #include <linux/module.h> |
9 | #include <linux/types.h> |
10 | #include <linux/kernel.h> |
11 | #include <linux/jiffies.h> |
12 | #include <linux/string.h> |
13 | #include <linux/in.h> |
14 | #include <linux/errno.h> |
15 | #include <linux/init.h> |
16 | #include <linux/skbuff.h> |
17 | #include <linux/siphash.h> |
18 | #include <linux/slab.h> |
19 | #include <linux/vmalloc.h> |
20 | #include <net/netlink.h> |
21 | #include <net/pkt_sched.h> |
22 | #include <net/pkt_cls.h> |
23 | #include <net/red.h> |
24 | |
25 | |
26 | /* Stochastic Fairness Queuing algorithm. |
27 | ======================================= |
28 | |
29 | Source: |
30 | Paul E. McKenney "Stochastic Fairness Queuing", |
31 | IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. |
32 | |
33 | Paul E. McKenney "Stochastic Fairness Queuing", |
34 | "Interworking: Research and Experience", v.2, 1991, p.113-131. |
35 | |
36 | |
37 | See also: |
38 | M. Shreedhar and George Varghese "Efficient Fair |
39 | Queuing using Deficit Round Robin", Proc. SIGCOMM 95. |
40 | |
41 | |
42 | This is not the thing that is usually called (W)FQ nowadays. |
43 | It does not use any timestamp mechanism, but instead |
44 | processes queues in round-robin order. |
45 | |
46 | ADVANTAGE: |
47 | |
48 | - It is very cheap. Both CPU and memory requirements are minimal. |
49 | |
50 | DRAWBACKS: |
51 | |
52 | - "Stochastic" -> It is not 100% fair. |
53 | When hash collisions occur, several flows are considered as one. |
54 | |
55 | - "Round-robin" -> It introduces larger delays than virtual clock |
56 | based schemes, and should not be used for isolating interactive |
57 | traffic from non-interactive. It means, that this scheduler |
58 | should be used as leaf of CBQ or P3, which put interactive traffic |
59 | to higher priority band. |
60 | |
61 | We still need true WFQ for top level CSZ, but using WFQ |
62 | for the best effort traffic is absolutely pointless: |
63 | SFQ is superior for this purpose. |
64 | |
65 | IMPLEMENTATION: |
66 | This implementation limits : |
67 | - maximal queue length per flow to 127 packets. |
68 | - max mtu to 2^18-1; |
69 | - max 65408 flows, |
70 | - number of hash buckets to 65536. |
71 | |
72 | It is easy to increase these values, but not in flight. */ |
73 | |
74 | #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ |
75 | #define SFQ_DEFAULT_FLOWS 128 |
76 | #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ |
77 | #define SFQ_EMPTY_SLOT 0xffff |
78 | #define SFQ_DEFAULT_HASH_DIVISOR 1024 |
79 | |
80 | /* We use 16 bits to store allot, and want to handle packets up to 64K |
81 | * Scale allot by 8 (1<<3) so that no overflow occurs. |
82 | */ |
83 | #define SFQ_ALLOT_SHIFT 3 |
84 | #define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) |
85 | |
86 | /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ |
87 | typedef u16 sfq_index; |
88 | |
89 | /* |
90 | * We dont use pointers to save space. |
91 | * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array |
92 | * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] |
93 | * are 'pointers' to dep[] array |
94 | */ |
95 | struct sfq_head { |
96 | sfq_index next; |
97 | sfq_index prev; |
98 | }; |
99 | |
100 | struct sfq_slot { |
101 | struct sk_buff *skblist_next; |
102 | struct sk_buff *skblist_prev; |
103 | sfq_index qlen; /* number of skbs in skblist */ |
104 | sfq_index next; /* next slot in sfq RR chain */ |
105 | struct sfq_head dep; /* anchor in dep[] chains */ |
106 | unsigned short hash; /* hash value (index in ht[]) */ |
107 | short allot; /* credit for this slot */ |
108 | |
109 | unsigned int backlog; |
110 | struct red_vars vars; |
111 | }; |
112 | |
113 | struct sfq_sched_data { |
114 | /* frequently used fields */ |
115 | int limit; /* limit of total number of packets in this qdisc */ |
116 | unsigned int divisor; /* number of slots in hash table */ |
117 | u8 headdrop; |
118 | u8 maxdepth; /* limit of packets per flow */ |
119 | |
120 | siphash_key_t perturbation; |
121 | u8 cur_depth; /* depth of longest slot */ |
122 | u8 flags; |
123 | unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ |
124 | struct tcf_proto __rcu *filter_list; |
125 | struct tcf_block *block; |
126 | sfq_index *ht; /* Hash table ('divisor' slots) */ |
127 | struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ |
128 | |
129 | struct red_parms *red_parms; |
130 | struct tc_sfqred_stats stats; |
131 | struct sfq_slot *tail; /* current slot in round */ |
132 | |
133 | struct sfq_head dep[SFQ_MAX_DEPTH + 1]; |
134 | /* Linked lists of slots, indexed by depth |
135 | * dep[0] : list of unused flows |
136 | * dep[1] : list of flows with 1 packet |
137 | * dep[X] : list of flows with X packets |
138 | */ |
139 | |
140 | unsigned int maxflows; /* number of flows in flows array */ |
141 | int perturb_period; |
142 | unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ |
143 | struct timer_list perturb_timer; |
144 | struct Qdisc *sch; |
145 | }; |
146 | |
147 | /* |
148 | * sfq_head are either in a sfq_slot or in dep[] array |
149 | */ |
150 | static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) |
151 | { |
152 | if (val < SFQ_MAX_FLOWS) |
153 | return &q->slots[val].dep; |
154 | return &q->dep[val - SFQ_MAX_FLOWS]; |
155 | } |
156 | |
157 | static unsigned int sfq_hash(const struct sfq_sched_data *q, |
158 | const struct sk_buff *skb) |
159 | { |
160 | return skb_get_hash_perturb(skb, perturb: &q->perturbation) & (q->divisor - 1); |
161 | } |
162 | |
163 | static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, |
164 | int *qerr) |
165 | { |
166 | struct sfq_sched_data *q = qdisc_priv(sch); |
167 | struct tcf_result res; |
168 | struct tcf_proto *fl; |
169 | int result; |
170 | |
171 | if (TC_H_MAJ(skb->priority) == sch->handle && |
172 | TC_H_MIN(skb->priority) > 0 && |
173 | TC_H_MIN(skb->priority) <= q->divisor) |
174 | return TC_H_MIN(skb->priority); |
175 | |
176 | fl = rcu_dereference_bh(q->filter_list); |
177 | if (!fl) |
178 | return sfq_hash(q, skb) + 1; |
179 | |
180 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; |
181 | result = tcf_classify(skb, NULL, tp: fl, res: &res, compat_mode: false); |
182 | if (result >= 0) { |
183 | #ifdef CONFIG_NET_CLS_ACT |
184 | switch (result) { |
185 | case TC_ACT_STOLEN: |
186 | case TC_ACT_QUEUED: |
187 | case TC_ACT_TRAP: |
188 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; |
189 | fallthrough; |
190 | case TC_ACT_SHOT: |
191 | return 0; |
192 | } |
193 | #endif |
194 | if (TC_H_MIN(res.classid) <= q->divisor) |
195 | return TC_H_MIN(res.classid); |
196 | } |
197 | return 0; |
198 | } |
199 | |
200 | /* |
201 | * x : slot number [0 .. SFQ_MAX_FLOWS - 1] |
202 | */ |
203 | static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) |
204 | { |
205 | sfq_index p, n; |
206 | struct sfq_slot *slot = &q->slots[x]; |
207 | int qlen = slot->qlen; |
208 | |
209 | p = qlen + SFQ_MAX_FLOWS; |
210 | n = q->dep[qlen].next; |
211 | |
212 | slot->dep.next = n; |
213 | slot->dep.prev = p; |
214 | |
215 | q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ |
216 | sfq_dep_head(q, val: n)->prev = x; |
217 | } |
218 | |
219 | #define sfq_unlink(q, x, n, p) \ |
220 | do { \ |
221 | n = q->slots[x].dep.next; \ |
222 | p = q->slots[x].dep.prev; \ |
223 | sfq_dep_head(q, p)->next = n; \ |
224 | sfq_dep_head(q, n)->prev = p; \ |
225 | } while (0) |
226 | |
227 | |
228 | static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) |
229 | { |
230 | sfq_index p, n; |
231 | int d; |
232 | |
233 | sfq_unlink(q, x, n, p); |
234 | |
235 | d = q->slots[x].qlen--; |
236 | if (n == p && q->cur_depth == d) |
237 | q->cur_depth--; |
238 | sfq_link(q, x); |
239 | } |
240 | |
241 | static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) |
242 | { |
243 | sfq_index p, n; |
244 | int d; |
245 | |
246 | sfq_unlink(q, x, n, p); |
247 | |
248 | d = ++q->slots[x].qlen; |
249 | if (q->cur_depth < d) |
250 | q->cur_depth = d; |
251 | sfq_link(q, x); |
252 | } |
253 | |
254 | /* helper functions : might be changed when/if skb use a standard list_head */ |
255 | |
256 | /* remove one skb from tail of slot queue */ |
257 | static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) |
258 | { |
259 | struct sk_buff *skb = slot->skblist_prev; |
260 | |
261 | slot->skblist_prev = skb->prev; |
262 | skb->prev->next = (struct sk_buff *)slot; |
263 | skb->next = skb->prev = NULL; |
264 | return skb; |
265 | } |
266 | |
267 | /* remove one skb from head of slot queue */ |
268 | static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) |
269 | { |
270 | struct sk_buff *skb = slot->skblist_next; |
271 | |
272 | slot->skblist_next = skb->next; |
273 | skb->next->prev = (struct sk_buff *)slot; |
274 | skb->next = skb->prev = NULL; |
275 | return skb; |
276 | } |
277 | |
278 | static inline void slot_queue_init(struct sfq_slot *slot) |
279 | { |
280 | memset(slot, 0, sizeof(*slot)); |
281 | slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; |
282 | } |
283 | |
284 | /* add skb to slot queue (tail add) */ |
285 | static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) |
286 | { |
287 | skb->prev = slot->skblist_prev; |
288 | skb->next = (struct sk_buff *)slot; |
289 | slot->skblist_prev->next = skb; |
290 | slot->skblist_prev = skb; |
291 | } |
292 | |
293 | static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) |
294 | { |
295 | struct sfq_sched_data *q = qdisc_priv(sch); |
296 | sfq_index x, d = q->cur_depth; |
297 | struct sk_buff *skb; |
298 | unsigned int len; |
299 | struct sfq_slot *slot; |
300 | |
301 | /* Queue is full! Find the longest slot and drop tail packet from it */ |
302 | if (d > 1) { |
303 | x = q->dep[d].next; |
304 | slot = &q->slots[x]; |
305 | drop: |
306 | skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); |
307 | len = qdisc_pkt_len(skb); |
308 | slot->backlog -= len; |
309 | sfq_dec(q, x); |
310 | sch->q.qlen--; |
311 | qdisc_qstats_backlog_dec(sch, skb); |
312 | qdisc_drop(skb, sch, to_free); |
313 | return len; |
314 | } |
315 | |
316 | if (d == 1) { |
317 | /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ |
318 | x = q->tail->next; |
319 | slot = &q->slots[x]; |
320 | q->tail->next = slot->next; |
321 | q->ht[slot->hash] = SFQ_EMPTY_SLOT; |
322 | goto drop; |
323 | } |
324 | |
325 | return 0; |
326 | } |
327 | |
328 | /* Is ECN parameter configured */ |
329 | static int sfq_prob_mark(const struct sfq_sched_data *q) |
330 | { |
331 | return q->flags & TC_RED_ECN; |
332 | } |
333 | |
334 | /* Should packets over max threshold just be marked */ |
335 | static int sfq_hard_mark(const struct sfq_sched_data *q) |
336 | { |
337 | return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; |
338 | } |
339 | |
340 | static int sfq_headdrop(const struct sfq_sched_data *q) |
341 | { |
342 | return q->headdrop; |
343 | } |
344 | |
345 | static int |
346 | sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) |
347 | { |
348 | struct sfq_sched_data *q = qdisc_priv(sch); |
349 | unsigned int hash, dropped; |
350 | sfq_index x, qlen; |
351 | struct sfq_slot *slot; |
352 | int ret; |
353 | struct sk_buff *head; |
354 | int delta; |
355 | |
356 | hash = sfq_classify(skb, sch, qerr: &ret); |
357 | if (hash == 0) { |
358 | if (ret & __NET_XMIT_BYPASS) |
359 | qdisc_qstats_drop(sch); |
360 | __qdisc_drop(skb, to_free); |
361 | return ret; |
362 | } |
363 | hash--; |
364 | |
365 | x = q->ht[hash]; |
366 | slot = &q->slots[x]; |
367 | if (x == SFQ_EMPTY_SLOT) { |
368 | x = q->dep[0].next; /* get a free slot */ |
369 | if (x >= SFQ_MAX_FLOWS) |
370 | return qdisc_drop(skb, sch, to_free); |
371 | q->ht[hash] = x; |
372 | slot = &q->slots[x]; |
373 | slot->hash = hash; |
374 | slot->backlog = 0; /* should already be 0 anyway... */ |
375 | red_set_vars(v: &slot->vars); |
376 | goto enqueue; |
377 | } |
378 | if (q->red_parms) { |
379 | slot->vars.qavg = red_calc_qavg_no_idle_time(p: q->red_parms, |
380 | v: &slot->vars, |
381 | backlog: slot->backlog); |
382 | switch (red_action(p: q->red_parms, |
383 | v: &slot->vars, |
384 | qavg: slot->vars.qavg)) { |
385 | case RED_DONT_MARK: |
386 | break; |
387 | |
388 | case RED_PROB_MARK: |
389 | qdisc_qstats_overlimit(sch); |
390 | if (sfq_prob_mark(q)) { |
391 | /* We know we have at least one packet in queue */ |
392 | if (sfq_headdrop(q) && |
393 | INET_ECN_set_ce(skb: slot->skblist_next)) { |
394 | q->stats.prob_mark_head++; |
395 | break; |
396 | } |
397 | if (INET_ECN_set_ce(skb)) { |
398 | q->stats.prob_mark++; |
399 | break; |
400 | } |
401 | } |
402 | q->stats.prob_drop++; |
403 | goto congestion_drop; |
404 | |
405 | case RED_HARD_MARK: |
406 | qdisc_qstats_overlimit(sch); |
407 | if (sfq_hard_mark(q)) { |
408 | /* We know we have at least one packet in queue */ |
409 | if (sfq_headdrop(q) && |
410 | INET_ECN_set_ce(skb: slot->skblist_next)) { |
411 | q->stats.forced_mark_head++; |
412 | break; |
413 | } |
414 | if (INET_ECN_set_ce(skb)) { |
415 | q->stats.forced_mark++; |
416 | break; |
417 | } |
418 | } |
419 | q->stats.forced_drop++; |
420 | goto congestion_drop; |
421 | } |
422 | } |
423 | |
424 | if (slot->qlen >= q->maxdepth) { |
425 | congestion_drop: |
426 | if (!sfq_headdrop(q)) |
427 | return qdisc_drop(skb, sch, to_free); |
428 | |
429 | /* We know we have at least one packet in queue */ |
430 | head = slot_dequeue_head(slot); |
431 | delta = qdisc_pkt_len(skb: head) - qdisc_pkt_len(skb); |
432 | sch->qstats.backlog -= delta; |
433 | slot->backlog -= delta; |
434 | qdisc_drop(skb: head, sch, to_free); |
435 | |
436 | slot_queue_add(slot, skb); |
437 | qdisc_tree_reduce_backlog(qdisc: sch, n: 0, len: delta); |
438 | return NET_XMIT_CN; |
439 | } |
440 | |
441 | enqueue: |
442 | qdisc_qstats_backlog_inc(sch, skb); |
443 | slot->backlog += qdisc_pkt_len(skb); |
444 | slot_queue_add(slot, skb); |
445 | sfq_inc(q, x); |
446 | if (slot->qlen == 1) { /* The flow is new */ |
447 | if (q->tail == NULL) { /* It is the first flow */ |
448 | slot->next = x; |
449 | } else { |
450 | slot->next = q->tail->next; |
451 | q->tail->next = x; |
452 | } |
453 | /* We put this flow at the end of our flow list. |
454 | * This might sound unfair for a new flow to wait after old ones, |
455 | * but we could endup servicing new flows only, and freeze old ones. |
456 | */ |
457 | q->tail = slot; |
458 | /* We could use a bigger initial quantum for new flows */ |
459 | slot->allot = q->scaled_quantum; |
460 | } |
461 | if (++sch->q.qlen <= q->limit) |
462 | return NET_XMIT_SUCCESS; |
463 | |
464 | qlen = slot->qlen; |
465 | dropped = sfq_drop(sch, to_free); |
466 | /* Return Congestion Notification only if we dropped a packet |
467 | * from this flow. |
468 | */ |
469 | if (qlen != slot->qlen) { |
470 | qdisc_tree_reduce_backlog(qdisc: sch, n: 0, len: dropped - qdisc_pkt_len(skb)); |
471 | return NET_XMIT_CN; |
472 | } |
473 | |
474 | /* As we dropped a packet, better let upper stack know this */ |
475 | qdisc_tree_reduce_backlog(qdisc: sch, n: 1, len: dropped); |
476 | return NET_XMIT_SUCCESS; |
477 | } |
478 | |
479 | static struct sk_buff * |
480 | sfq_dequeue(struct Qdisc *sch) |
481 | { |
482 | struct sfq_sched_data *q = qdisc_priv(sch); |
483 | struct sk_buff *skb; |
484 | sfq_index a, next_a; |
485 | struct sfq_slot *slot; |
486 | |
487 | /* No active slots */ |
488 | if (q->tail == NULL) |
489 | return NULL; |
490 | |
491 | next_slot: |
492 | a = q->tail->next; |
493 | slot = &q->slots[a]; |
494 | if (slot->allot <= 0) { |
495 | q->tail = slot; |
496 | slot->allot += q->scaled_quantum; |
497 | goto next_slot; |
498 | } |
499 | skb = slot_dequeue_head(slot); |
500 | sfq_dec(q, x: a); |
501 | qdisc_bstats_update(sch, skb); |
502 | sch->q.qlen--; |
503 | qdisc_qstats_backlog_dec(sch, skb); |
504 | slot->backlog -= qdisc_pkt_len(skb); |
505 | /* Is the slot empty? */ |
506 | if (slot->qlen == 0) { |
507 | q->ht[slot->hash] = SFQ_EMPTY_SLOT; |
508 | next_a = slot->next; |
509 | if (a == next_a) { |
510 | q->tail = NULL; /* no more active slots */ |
511 | return skb; |
512 | } |
513 | q->tail->next = next_a; |
514 | } else { |
515 | slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb)); |
516 | } |
517 | return skb; |
518 | } |
519 | |
520 | static void |
521 | sfq_reset(struct Qdisc *sch) |
522 | { |
523 | struct sk_buff *skb; |
524 | |
525 | while ((skb = sfq_dequeue(sch)) != NULL) |
526 | rtnl_kfree_skbs(head: skb, tail: skb); |
527 | } |
528 | |
529 | /* |
530 | * When q->perturbation is changed, we rehash all queued skbs |
531 | * to avoid OOO (Out Of Order) effects. |
532 | * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change |
533 | * counters. |
534 | */ |
535 | static void sfq_rehash(struct Qdisc *sch) |
536 | { |
537 | struct sfq_sched_data *q = qdisc_priv(sch); |
538 | struct sk_buff *skb; |
539 | int i; |
540 | struct sfq_slot *slot; |
541 | struct sk_buff_head list; |
542 | int dropped = 0; |
543 | unsigned int drop_len = 0; |
544 | |
545 | __skb_queue_head_init(list: &list); |
546 | |
547 | for (i = 0; i < q->maxflows; i++) { |
548 | slot = &q->slots[i]; |
549 | if (!slot->qlen) |
550 | continue; |
551 | while (slot->qlen) { |
552 | skb = slot_dequeue_head(slot); |
553 | sfq_dec(q, x: i); |
554 | __skb_queue_tail(list: &list, newsk: skb); |
555 | } |
556 | slot->backlog = 0; |
557 | red_set_vars(v: &slot->vars); |
558 | q->ht[slot->hash] = SFQ_EMPTY_SLOT; |
559 | } |
560 | q->tail = NULL; |
561 | |
562 | while ((skb = __skb_dequeue(list: &list)) != NULL) { |
563 | unsigned int hash = sfq_hash(q, skb); |
564 | sfq_index x = q->ht[hash]; |
565 | |
566 | slot = &q->slots[x]; |
567 | if (x == SFQ_EMPTY_SLOT) { |
568 | x = q->dep[0].next; /* get a free slot */ |
569 | if (x >= SFQ_MAX_FLOWS) { |
570 | drop: |
571 | qdisc_qstats_backlog_dec(sch, skb); |
572 | drop_len += qdisc_pkt_len(skb); |
573 | kfree_skb(skb); |
574 | dropped++; |
575 | continue; |
576 | } |
577 | q->ht[hash] = x; |
578 | slot = &q->slots[x]; |
579 | slot->hash = hash; |
580 | } |
581 | if (slot->qlen >= q->maxdepth) |
582 | goto drop; |
583 | slot_queue_add(slot, skb); |
584 | if (q->red_parms) |
585 | slot->vars.qavg = red_calc_qavg(p: q->red_parms, |
586 | v: &slot->vars, |
587 | backlog: slot->backlog); |
588 | slot->backlog += qdisc_pkt_len(skb); |
589 | sfq_inc(q, x); |
590 | if (slot->qlen == 1) { /* The flow is new */ |
591 | if (q->tail == NULL) { /* It is the first flow */ |
592 | slot->next = x; |
593 | } else { |
594 | slot->next = q->tail->next; |
595 | q->tail->next = x; |
596 | } |
597 | q->tail = slot; |
598 | slot->allot = q->scaled_quantum; |
599 | } |
600 | } |
601 | sch->q.qlen -= dropped; |
602 | qdisc_tree_reduce_backlog(qdisc: sch, n: dropped, len: drop_len); |
603 | } |
604 | |
605 | static void sfq_perturbation(struct timer_list *t) |
606 | { |
607 | struct sfq_sched_data *q = from_timer(q, t, perturb_timer); |
608 | struct Qdisc *sch = q->sch; |
609 | spinlock_t *root_lock; |
610 | siphash_key_t nkey; |
611 | |
612 | get_random_bytes(buf: &nkey, len: sizeof(nkey)); |
613 | rcu_read_lock(); |
614 | root_lock = qdisc_lock(qdisc: qdisc_root_sleeping(qdisc: sch)); |
615 | spin_lock(lock: root_lock); |
616 | q->perturbation = nkey; |
617 | if (!q->filter_list && q->tail) |
618 | sfq_rehash(sch); |
619 | spin_unlock(lock: root_lock); |
620 | |
621 | if (q->perturb_period) |
622 | mod_timer(timer: &q->perturb_timer, expires: jiffies + q->perturb_period); |
623 | rcu_read_unlock(); |
624 | } |
625 | |
626 | static int sfq_change(struct Qdisc *sch, struct nlattr *opt) |
627 | { |
628 | struct sfq_sched_data *q = qdisc_priv(sch); |
629 | struct tc_sfq_qopt *ctl = nla_data(nla: opt); |
630 | struct tc_sfq_qopt_v1 *ctl_v1 = NULL; |
631 | unsigned int qlen, dropped = 0; |
632 | struct red_parms *p = NULL; |
633 | struct sk_buff *to_free = NULL; |
634 | struct sk_buff *tail = NULL; |
635 | |
636 | if (opt->nla_len < nla_attr_size(payload: sizeof(*ctl))) |
637 | return -EINVAL; |
638 | if (opt->nla_len >= nla_attr_size(payload: sizeof(*ctl_v1))) |
639 | ctl_v1 = nla_data(nla: opt); |
640 | if (ctl->divisor && |
641 | (!is_power_of_2(n: ctl->divisor) || ctl->divisor > 65536)) |
642 | return -EINVAL; |
643 | |
644 | /* slot->allot is a short, make sure quantum is not too big. */ |
645 | if (ctl->quantum) { |
646 | unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum); |
647 | |
648 | if (scaled <= 0 || scaled > SHRT_MAX) |
649 | return -EINVAL; |
650 | } |
651 | |
652 | if (ctl_v1 && !red_check_params(qth_min: ctl_v1->qth_min, qth_max: ctl_v1->qth_max, |
653 | Wlog: ctl_v1->Wlog, Scell_log: ctl_v1->Scell_log, NULL)) |
654 | return -EINVAL; |
655 | if (ctl_v1 && ctl_v1->qth_min) { |
656 | p = kmalloc(size: sizeof(*p), GFP_KERNEL); |
657 | if (!p) |
658 | return -ENOMEM; |
659 | } |
660 | sch_tree_lock(q: sch); |
661 | if (ctl->quantum) { |
662 | q->quantum = ctl->quantum; |
663 | q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); |
664 | } |
665 | q->perturb_period = ctl->perturb_period * HZ; |
666 | if (ctl->flows) |
667 | q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); |
668 | if (ctl->divisor) { |
669 | q->divisor = ctl->divisor; |
670 | q->maxflows = min_t(u32, q->maxflows, q->divisor); |
671 | } |
672 | if (ctl_v1) { |
673 | if (ctl_v1->depth) |
674 | q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); |
675 | if (p) { |
676 | swap(q->red_parms, p); |
677 | red_set_parms(p: q->red_parms, |
678 | qth_min: ctl_v1->qth_min, qth_max: ctl_v1->qth_max, |
679 | Wlog: ctl_v1->Wlog, |
680 | Plog: ctl_v1->Plog, Scell_log: ctl_v1->Scell_log, |
681 | NULL, |
682 | max_P: ctl_v1->max_P); |
683 | } |
684 | q->flags = ctl_v1->flags; |
685 | q->headdrop = ctl_v1->headdrop; |
686 | } |
687 | if (ctl->limit) { |
688 | q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); |
689 | q->maxflows = min_t(u32, q->maxflows, q->limit); |
690 | } |
691 | |
692 | qlen = sch->q.qlen; |
693 | while (sch->q.qlen > q->limit) { |
694 | dropped += sfq_drop(sch, to_free: &to_free); |
695 | if (!tail) |
696 | tail = to_free; |
697 | } |
698 | |
699 | rtnl_kfree_skbs(head: to_free, tail); |
700 | qdisc_tree_reduce_backlog(qdisc: sch, n: qlen - sch->q.qlen, len: dropped); |
701 | |
702 | del_timer(timer: &q->perturb_timer); |
703 | if (q->perturb_period) { |
704 | mod_timer(timer: &q->perturb_timer, expires: jiffies + q->perturb_period); |
705 | get_random_bytes(buf: &q->perturbation, len: sizeof(q->perturbation)); |
706 | } |
707 | sch_tree_unlock(q: sch); |
708 | kfree(objp: p); |
709 | return 0; |
710 | } |
711 | |
712 | static void *sfq_alloc(size_t sz) |
713 | { |
714 | return kvmalloc(size: sz, GFP_KERNEL); |
715 | } |
716 | |
717 | static void sfq_free(void *addr) |
718 | { |
719 | kvfree(addr); |
720 | } |
721 | |
722 | static void sfq_destroy(struct Qdisc *sch) |
723 | { |
724 | struct sfq_sched_data *q = qdisc_priv(sch); |
725 | |
726 | tcf_block_put(block: q->block); |
727 | q->perturb_period = 0; |
728 | del_timer_sync(timer: &q->perturb_timer); |
729 | sfq_free(addr: q->ht); |
730 | sfq_free(addr: q->slots); |
731 | kfree(objp: q->red_parms); |
732 | } |
733 | |
734 | static int sfq_init(struct Qdisc *sch, struct nlattr *opt, |
735 | struct netlink_ext_ack *extack) |
736 | { |
737 | struct sfq_sched_data *q = qdisc_priv(sch); |
738 | int i; |
739 | int err; |
740 | |
741 | q->sch = sch; |
742 | timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); |
743 | |
744 | err = tcf_block_get(p_block: &q->block, p_filter_chain: &q->filter_list, q: sch, extack); |
745 | if (err) |
746 | return err; |
747 | |
748 | for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { |
749 | q->dep[i].next = i + SFQ_MAX_FLOWS; |
750 | q->dep[i].prev = i + SFQ_MAX_FLOWS; |
751 | } |
752 | |
753 | q->limit = SFQ_MAX_DEPTH; |
754 | q->maxdepth = SFQ_MAX_DEPTH; |
755 | q->cur_depth = 0; |
756 | q->tail = NULL; |
757 | q->divisor = SFQ_DEFAULT_HASH_DIVISOR; |
758 | q->maxflows = SFQ_DEFAULT_FLOWS; |
759 | q->quantum = psched_mtu(dev: qdisc_dev(qdisc: sch)); |
760 | q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); |
761 | q->perturb_period = 0; |
762 | get_random_bytes(buf: &q->perturbation, len: sizeof(q->perturbation)); |
763 | |
764 | if (opt) { |
765 | int err = sfq_change(sch, opt); |
766 | if (err) |
767 | return err; |
768 | } |
769 | |
770 | q->ht = sfq_alloc(sz: sizeof(q->ht[0]) * q->divisor); |
771 | q->slots = sfq_alloc(sz: sizeof(q->slots[0]) * q->maxflows); |
772 | if (!q->ht || !q->slots) { |
773 | /* Note: sfq_destroy() will be called by our caller */ |
774 | return -ENOMEM; |
775 | } |
776 | |
777 | for (i = 0; i < q->divisor; i++) |
778 | q->ht[i] = SFQ_EMPTY_SLOT; |
779 | |
780 | for (i = 0; i < q->maxflows; i++) { |
781 | slot_queue_init(slot: &q->slots[i]); |
782 | sfq_link(q, x: i); |
783 | } |
784 | if (q->limit >= 1) |
785 | sch->flags |= TCQ_F_CAN_BYPASS; |
786 | else |
787 | sch->flags &= ~TCQ_F_CAN_BYPASS; |
788 | return 0; |
789 | } |
790 | |
791 | static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) |
792 | { |
793 | struct sfq_sched_data *q = qdisc_priv(sch); |
794 | unsigned char *b = skb_tail_pointer(skb); |
795 | struct tc_sfq_qopt_v1 opt; |
796 | struct red_parms *p = q->red_parms; |
797 | |
798 | memset(&opt, 0, sizeof(opt)); |
799 | opt.v0.quantum = q->quantum; |
800 | opt.v0.perturb_period = q->perturb_period / HZ; |
801 | opt.v0.limit = q->limit; |
802 | opt.v0.divisor = q->divisor; |
803 | opt.v0.flows = q->maxflows; |
804 | opt.depth = q->maxdepth; |
805 | opt.headdrop = q->headdrop; |
806 | |
807 | if (p) { |
808 | opt.qth_min = p->qth_min >> p->Wlog; |
809 | opt.qth_max = p->qth_max >> p->Wlog; |
810 | opt.Wlog = p->Wlog; |
811 | opt.Plog = p->Plog; |
812 | opt.Scell_log = p->Scell_log; |
813 | opt.max_P = p->max_P; |
814 | } |
815 | memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); |
816 | opt.flags = q->flags; |
817 | |
818 | if (nla_put(skb, attrtype: TCA_OPTIONS, attrlen: sizeof(opt), data: &opt)) |
819 | goto nla_put_failure; |
820 | |
821 | return skb->len; |
822 | |
823 | nla_put_failure: |
824 | nlmsg_trim(skb, mark: b); |
825 | return -1; |
826 | } |
827 | |
828 | static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) |
829 | { |
830 | return NULL; |
831 | } |
832 | |
833 | static unsigned long sfq_find(struct Qdisc *sch, u32 classid) |
834 | { |
835 | return 0; |
836 | } |
837 | |
838 | static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, |
839 | u32 classid) |
840 | { |
841 | return 0; |
842 | } |
843 | |
844 | static void sfq_unbind(struct Qdisc *q, unsigned long cl) |
845 | { |
846 | } |
847 | |
848 | static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl, |
849 | struct netlink_ext_ack *extack) |
850 | { |
851 | struct sfq_sched_data *q = qdisc_priv(sch); |
852 | |
853 | if (cl) |
854 | return NULL; |
855 | return q->block; |
856 | } |
857 | |
858 | static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, |
859 | struct sk_buff *skb, struct tcmsg *tcm) |
860 | { |
861 | tcm->tcm_handle |= TC_H_MIN(cl); |
862 | return 0; |
863 | } |
864 | |
865 | static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, |
866 | struct gnet_dump *d) |
867 | { |
868 | struct sfq_sched_data *q = qdisc_priv(sch); |
869 | sfq_index idx = q->ht[cl - 1]; |
870 | struct gnet_stats_queue qs = { 0 }; |
871 | struct tc_sfq_xstats xstats = { 0 }; |
872 | |
873 | if (idx != SFQ_EMPTY_SLOT) { |
874 | const struct sfq_slot *slot = &q->slots[idx]; |
875 | |
876 | xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; |
877 | qs.qlen = slot->qlen; |
878 | qs.backlog = slot->backlog; |
879 | } |
880 | if (gnet_stats_copy_queue(d, NULL, q: &qs, qlen: qs.qlen) < 0) |
881 | return -1; |
882 | return gnet_stats_copy_app(d, st: &xstats, len: sizeof(xstats)); |
883 | } |
884 | |
885 | static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
886 | { |
887 | struct sfq_sched_data *q = qdisc_priv(sch); |
888 | unsigned int i; |
889 | |
890 | if (arg->stop) |
891 | return; |
892 | |
893 | for (i = 0; i < q->divisor; i++) { |
894 | if (q->ht[i] == SFQ_EMPTY_SLOT) { |
895 | arg->count++; |
896 | continue; |
897 | } |
898 | if (!tc_qdisc_stats_dump(sch, cl: i + 1, arg)) |
899 | break; |
900 | } |
901 | } |
902 | |
903 | static const struct Qdisc_class_ops sfq_class_ops = { |
904 | .leaf = sfq_leaf, |
905 | .find = sfq_find, |
906 | .tcf_block = sfq_tcf_block, |
907 | .bind_tcf = sfq_bind, |
908 | .unbind_tcf = sfq_unbind, |
909 | .dump = sfq_dump_class, |
910 | .dump_stats = sfq_dump_class_stats, |
911 | .walk = sfq_walk, |
912 | }; |
913 | |
914 | static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { |
915 | .cl_ops = &sfq_class_ops, |
916 | .id = "sfq" , |
917 | .priv_size = sizeof(struct sfq_sched_data), |
918 | .enqueue = sfq_enqueue, |
919 | .dequeue = sfq_dequeue, |
920 | .peek = qdisc_peek_dequeued, |
921 | .init = sfq_init, |
922 | .reset = sfq_reset, |
923 | .destroy = sfq_destroy, |
924 | .change = NULL, |
925 | .dump = sfq_dump, |
926 | .owner = THIS_MODULE, |
927 | }; |
928 | |
929 | static int __init sfq_module_init(void) |
930 | { |
931 | return register_qdisc(qops: &sfq_qdisc_ops); |
932 | } |
933 | static void __exit sfq_module_exit(void) |
934 | { |
935 | unregister_qdisc(qops: &sfq_qdisc_ops); |
936 | } |
937 | module_init(sfq_module_init) |
938 | module_exit(sfq_module_exit) |
939 | MODULE_LICENSE("GPL" ); |
940 | |