sch_sfq.c source code [linux/net/sched/sch_sfq.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
4	*
5	* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6	*/
7
8	#include <linux/module.h>
9	#include <linux/types.h>
10	#include <linux/kernel.h>
11	#include <linux/jiffies.h>
12	#include <linux/string.h>
13	#include <linux/in.h>
14	#include <linux/errno.h>
15	#include <linux/init.h>
16	#include <linux/skbuff.h>
17	#include <linux/siphash.h>
18	#include <linux/slab.h>
19	#include <linux/vmalloc.h>
20	#include <net/netlink.h>
21	#include <net/pkt_sched.h>
22	#include <net/pkt_cls.h>
23	#include <net/red.h>
24
25
26	/ Stochastic Fairness Queuing algorithm.*
27	=======================================
28
29	Source:
30	Paul E. McKenney "Stochastic Fairness Queuing",
31	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
32
33	Paul E. McKenney "Stochastic Fairness Queuing",
34	"Interworking: Research and Experience", v.2, 1991, p.113-131.
35
36
37	See also:
38	M. Shreedhar and George Varghese "Efficient Fair
39	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40
41
42	This is not the thing that is usually called (W)FQ nowadays.
43	It does not use any timestamp mechanism, but instead
44	processes queues in round-robin order.
45
46	ADVANTAGE:
47
48	- It is very cheap. Both CPU and memory requirements are minimal.
49
50	DRAWBACKS:
51
52	- "Stochastic" -> It is not 100% fair.
53	When hash collisions occur, several flows are considered as one.
54
55	- "Round-robin" -> It introduces larger delays than virtual clock
56	based schemes, and should not be used for isolating interactive
57	traffic from non-interactive. It means, that this scheduler
58	should be used as leaf of CBQ or P3, which put interactive traffic
59	to higher priority band.
60
61	We still need true WFQ for top level CSZ, but using WFQ
62	for the best effort traffic is absolutely pointless:
63	SFQ is superior for this purpose.
64
65	IMPLEMENTATION:
66	This implementation limits :
67	- maximal queue length per flow to 127 packets.
68	- max mtu to 2^18-1;
69	- max 65408 flows,
70	- number of hash buckets to 65536.
71
72	It is easy to increase these values, but not in flight. /*
73
74	#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
75	#define SFQ_DEFAULT_FLOWS 128
76	#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77	#define SFQ_EMPTY_SLOT 0xffff
78	#define SFQ_DEFAULT_HASH_DIVISOR 1024
79
80	/ We use 16 bits to store allot, and want to handle packets up to 64K*
81	* Scale allot by 8 (1<<3) so that no overflow occurs.
82	*/
83	#define SFQ_ALLOT_SHIFT 3
84	#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
85
86	/ This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values /
87	typedef u16 sfq_index;
88
89	/*
90	* We dont use pointers to save space.
91	* Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
92	* while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
93	* are 'pointers' to dep[] array
94	*/
95	struct sfq_head {
96	sfq_index next;
97	sfq_index prev;
98	};
99
100	struct sfq_slot {
101	struct sk_buff *skblist_next;
102	struct sk_buff *skblist_prev;
103	sfq_index qlen; / number of skbs in skblist /
104	sfq_index next; / next slot in sfq RR chain /
105	struct sfq_head dep; / anchor in dep[] chains /
106	unsigned short hash; / hash value (index in ht[]) /
107	short allot; / credit for this slot /
108
109	unsigned int backlog;
110	struct red_vars vars;
111	};
112
113	struct sfq_sched_data {
114	/ frequently used fields /
115	int limit; / limit of total number of packets in this qdisc /
116	unsigned int divisor; / number of slots in hash table /
117	u8 headdrop;
118	u8 maxdepth; / limit of packets per flow /
119
120	siphash_key_t perturbation;
121	u8 cur_depth; / depth of longest slot /
122	u8 flags;
123	unsigned short scaled_quantum; / SFQ_ALLOT_SIZE(quantum) /
124	struct tcf_proto __rcu *filter_list;
125	struct tcf_block *block;
126	sfq_index ht; /* Hash table ('divisor' slots) /
127	struct sfq_slot slots; /* Flows table ('maxflows' entries) /
128
129	struct red_parms *red_parms;
130	struct tc_sfqred_stats stats;
131	struct sfq_slot tail; /* current slot in round /
132
133	struct sfq_head dep[SFQ_MAX_DEPTH + `1`];
134	/ Linked lists of slots, indexed by depth*
135	* dep[0] : list of unused flows
136	* dep[1] : list of flows with 1 packet
137	* dep[X] : list of flows with X packets
138	*/
139
140	unsigned int maxflows; / number of flows in flows array /
141	int perturb_period;
142	unsigned int quantum; / Allotment per round: MUST BE >= MTU /
143	struct timer_list perturb_timer;
144	struct Qdisc *sch;
145	};
146
147	/*
148	* sfq_head are either in a sfq_slot or in dep[] array
149	*/
150	static inline struct sfq_head sfq_dep_head(struct* sfq_sched_data *q, sfq_index val)
151	{
152	if (val < SFQ_MAX_FLOWS)
153	return &q->slots[val].dep;
154	return &q->dep[val - SFQ_MAX_FLOWS];
155	}
156
157	static unsigned int sfq_hash(const struct sfq_sched_data *q,
158	const struct sk_buff *skb)
159	{
160	return skb_get_hash_perturb(skb, perturb: &q->perturbation) & (q->divisor - `1`);
161	}
162
163	static unsigned int sfq_classify(struct sk_buff skb, struct* Qdisc *sch,
164	int *qerr)
165	{
166	struct sfq_sched_data *q = qdisc_priv(sch);
167	struct tcf_result res;
168	struct tcf_proto *fl;
169	int result;
170
171	if (TC_H_MAJ(skb->priority) == sch->handle &&
172	TC_H_MIN(skb->priority) > `0` &&
173	TC_H_MIN(skb->priority) <= q->divisor)
174	return TC_H_MIN(skb->priority);
175
176	fl = rcu_dereference_bh(q->filter_list);
177	if (!fl)
178	return sfq_hash(q, skb) + `1`;
179
180	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
181	result = tcf_classify(skb, NULL, tp: fl, res: &res, compat_mode: false);
182	if (result >= `0`) {
183	#ifdef CONFIG_NET_CLS_ACT
184	switch (result) {
185	case TC_ACT_STOLEN:
186	case TC_ACT_QUEUED:
187	case TC_ACT_TRAP:
188	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_STOLEN;
189	fallthrough;
190	case TC_ACT_SHOT:
191	return `0`;
192	}
193	#endif
194	if (TC_H_MIN(res.classid) <= q->divisor)
195	return TC_H_MIN(res.classid);
196	}
197	return `0`;
198	}
199
200	/*
201	* x : slot number [0 .. SFQ_MAX_FLOWS - 1]
202	*/
203	static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
204	{
205	sfq_index p, n;
206	struct sfq_slot *slot = &q->slots[x];
207	int qlen = slot->qlen;
208
209	p = qlen + SFQ_MAX_FLOWS;
210	n = q->dep[qlen].next;
211
212	slot->dep.next = n;
213	slot->dep.prev = p;
214
215	q->dep[qlen].next = x; / sfq_dep_head(q, p)->next = x /
216	sfq_dep_head(q, val: n)->prev = x;
217	}
218
219	#define sfq_unlink(q, x, n, p) \
220	do { \
221	n = q->slots[x].dep.next; \
222	p = q->slots[x].dep.prev; \
223	sfq_dep_head(q, p)->next = n; \
224	sfq_dep_head(q, n)->prev = p; \
225	} while (0)
226
227
228	static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
229	{
230	sfq_index p, n;
231	int d;
232
233	sfq_unlink(q, x, n, p);
234
235	d = q->slots[x].qlen--;
236	if (n == p && q->cur_depth == d)
237	q->cur_depth--;
238	sfq_link(q, x);
239	}
240
241	static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
242	{
243	sfq_index p, n;
244	int d;
245
246	sfq_unlink(q, x, n, p);
247
248	d = ++q->slots[x].qlen;
249	if (q->cur_depth < d)
250	q->cur_depth = d;
251	sfq_link(q, x);
252	}
253
254	/ helper functions : might be changed when/if skb use a standard list_head /
255
256	/ remove one skb from tail of slot queue /
257	static inline struct sk_buff slot_dequeue_tail(struct* sfq_slot *slot)
258	{
259	struct sk_buff *skb = slot->skblist_prev;
260
261	slot->skblist_prev = skb->prev;
262	skb->prev->next = (struct sk_buff *)slot;
263	skb->next = skb->prev = NULL;
264	return skb;
265	}
266
267	/ remove one skb from head of slot queue /
268	static inline struct sk_buff slot_dequeue_head(struct* sfq_slot *slot)
269	{
270	struct sk_buff *skb = slot->skblist_next;
271
272	slot->skblist_next = skb->next;
273	skb->next->prev = (struct sk_buff *)slot;
274	skb->next = skb->prev = NULL;
275	return skb;
276	}
277
278	static inline void slot_queue_init(struct sfq_slot *slot)
279	{
280	memset(slot, `0`, sizeof(*slot));
281	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
282	}
283
284	/ add skb to slot queue (tail add) /
285	static inline void slot_queue_add(struct sfq_slot slot, struct* sk_buff *skb)
286	{
287	skb->prev = slot->skblist_prev;
288	skb->next = (struct sk_buff *)slot;
289	slot->skblist_prev->next = skb;
290	slot->skblist_prev = skb;
291	}
292
293	static unsigned int sfq_drop(struct Qdisc sch, struct* sk_buff **to_free)
294	{
295	struct sfq_sched_data *q = qdisc_priv(sch);
296	sfq_index x, d = q->cur_depth;
297	struct sk_buff *skb;
298	unsigned int len;
299	struct sfq_slot *slot;
300
301	/ Queue is full! Find the longest slot and drop tail packet from it /
302	if (d > `1`) {
303	x = q->dep[d].next;
304	slot = &q->slots[x];
305	drop:
306	skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
307	len = qdisc_pkt_len(skb);
308	slot->backlog -= len;
309	sfq_dec(q, x);
310	sch->q.qlen--;
311	qdisc_qstats_backlog_dec(sch, skb);
312	qdisc_drop(skb, sch, to_free);
313	return len;
314	}
315
316	if (d == `1`) {
317	/ It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. /
318	x = q->tail->next;
319	slot = &q->slots[x];
320	q->tail->next = slot->next;
321	q->ht[slot->hash] = SFQ_EMPTY_SLOT;
322	goto drop;
323	}
324
325	return `0`;
326	}
327
328	/ Is ECN parameter configured /
329	static int sfq_prob_mark(const struct sfq_sched_data *q)
330	{
331	return q->flags & TC_RED_ECN;
332	}
333
334	/ Should packets over max threshold just be marked /
335	static int sfq_hard_mark(const struct sfq_sched_data *q)
336	{
337	return (q->flags & (TC_RED_ECN \| TC_RED_HARDDROP)) == TC_RED_ECN;
338	}
339
340	static int sfq_headdrop(const struct sfq_sched_data *q)
341	{
342	return q->headdrop;
343	}
344
345	static int
346	sfq_enqueue(struct sk_buff skb, struct* Qdisc sch, struct* sk_buff **to_free)
347	{
348	struct sfq_sched_data *q = qdisc_priv(sch);
349	unsigned int hash, dropped;
350	sfq_index x, qlen;
351	struct sfq_slot *slot;
352	int ret;
353	struct sk_buff *head;
354	int delta;
355
356	hash = sfq_classify(skb, sch, qerr: &ret);
357	if (hash == `0`) {
358	if (ret & __NET_XMIT_BYPASS)
359	qdisc_qstats_drop(sch);
360	__qdisc_drop(skb, to_free);
361	return ret;
362	}
363	hash--;
364
365	x = q->ht[hash];
366	slot = &q->slots[x];
367	if (x == SFQ_EMPTY_SLOT) {
368	x = q->dep[`0`].next; / get a free slot /
369	if (x >= SFQ_MAX_FLOWS)
370	return qdisc_drop(skb, sch, to_free);
371	q->ht[hash] = x;
372	slot = &q->slots[x];
373	slot->hash = hash;
374	slot->backlog = `0`; / should already be 0 anyway... /
375	red_set_vars(v: &slot->vars);
376	goto enqueue;
377	}
378	if (q->red_parms) {
379	slot->vars.qavg = red_calc_qavg_no_idle_time(p: q->red_parms,
380	v: &slot->vars,
381	backlog: slot->backlog);
382	switch (red_action(p: q->red_parms,
383	v: &slot->vars,
384	qavg: slot->vars.qavg)) {
385	case RED_DONT_MARK:
386	break;
387
388	case RED_PROB_MARK:
389	qdisc_qstats_overlimit(sch);
390	if (sfq_prob_mark(q)) {
391	/ We know we have at least one packet in queue /
392	if (sfq_headdrop(q) &&
393	INET_ECN_set_ce(skb: slot->skblist_next)) {
394	q->stats.prob_mark_head++;
395	break;
396	}
397	if (INET_ECN_set_ce(skb)) {
398	q->stats.prob_mark++;
399	break;
400	}
401	}
402	q->stats.prob_drop++;
403	goto congestion_drop;
404
405	case RED_HARD_MARK:
406	qdisc_qstats_overlimit(sch);
407	if (sfq_hard_mark(q)) {
408	/ We know we have at least one packet in queue /
409	if (sfq_headdrop(q) &&
410	INET_ECN_set_ce(skb: slot->skblist_next)) {
411	q->stats.forced_mark_head++;
412	break;
413	}
414	if (INET_ECN_set_ce(skb)) {
415	q->stats.forced_mark++;
416	break;
417	}
418	}
419	q->stats.forced_drop++;
420	goto congestion_drop;
421	}
422	}
423
424	if (slot->qlen >= q->maxdepth) {
425	congestion_drop:
426	if (!sfq_headdrop(q))
427	return qdisc_drop(skb, sch, to_free);
428
429	/ We know we have at least one packet in queue /
430	head = slot_dequeue_head(slot);
431	delta = qdisc_pkt_len(skb: head) - qdisc_pkt_len(skb);
432	sch->qstats.backlog -= delta;
433	slot->backlog -= delta;
434	qdisc_drop(skb: head, sch, to_free);
435
436	slot_queue_add(slot, skb);
437	qdisc_tree_reduce_backlog(qdisc: sch, n: `0`, len: delta);
438	return NET_XMIT_CN;
439	}
440
441	enqueue:
442	qdisc_qstats_backlog_inc(sch, skb);
443	slot->backlog += qdisc_pkt_len(skb);
444	slot_queue_add(slot, skb);
445	sfq_inc(q, x);
446	if (slot->qlen == `1`) { / The flow is new /
447	if (q->tail == NULL) { / It is the first flow /
448	slot->next = x;
449	} else {
450	slot->next = q->tail->next;
451	q->tail->next = x;
452	}
453	/ We put this flow at the end of our flow list.*
454	* This might sound unfair for a new flow to wait after old ones,
455	* but we could endup servicing new flows only, and freeze old ones.
456	*/
457	q->tail = slot;
458	/ We could use a bigger initial quantum for new flows /
459	slot->allot = q->scaled_quantum;
460	}
461	if (++sch->q.qlen <= q->limit)
462	return NET_XMIT_SUCCESS;
463
464	qlen = slot->qlen;
465	dropped = sfq_drop(sch, to_free);
466	/ Return Congestion Notification only if we dropped a packet*
467	* from this flow.
468	*/
469	if (qlen != slot->qlen) {
470	qdisc_tree_reduce_backlog(qdisc: sch, n: `0`, len: dropped - qdisc_pkt_len(skb));
471	return NET_XMIT_CN;
472	}
473
474	/ As we dropped a packet, better let upper stack know this /
475	qdisc_tree_reduce_backlog(qdisc: sch, n: `1`, len: dropped);
476	return NET_XMIT_SUCCESS;
477	}
478
479	static struct sk_buff *
480	sfq_dequeue(struct Qdisc *sch)
481	{
482	struct sfq_sched_data *q = qdisc_priv(sch);
483	struct sk_buff *skb;
484	sfq_index a, next_a;
485	struct sfq_slot *slot;
486
487	/ No active slots /
488	if (q->tail == NULL)
489	return NULL;
490
491	next_slot:
492	a = q->tail->next;
493	slot = &q->slots[a];
494	if (slot->allot <= `0`) {
495	q->tail = slot;
496	slot->allot += q->scaled_quantum;
497	goto next_slot;
498	}
499	skb = slot_dequeue_head(slot);
500	sfq_dec(q, x: a);
501	qdisc_bstats_update(sch, skb);
502	sch->q.qlen--;
503	qdisc_qstats_backlog_dec(sch, skb);
504	slot->backlog -= qdisc_pkt_len(skb);
505	/ Is the slot empty? /
506	if (slot->qlen == `0`) {
507	q->ht[slot->hash] = SFQ_EMPTY_SLOT;
508	next_a = slot->next;
509	if (a == next_a) {
510	q->tail = NULL; / no more active slots /
511	return skb;
512	}
513	q->tail->next = next_a;
514	} else {
515	slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
516	}
517	return skb;
518	}
519
520	static void
521	sfq_reset(struct Qdisc *sch)
522	{
523	struct sk_buff *skb;
524
525	while ((skb = sfq_dequeue(sch)) != NULL)
526	rtnl_kfree_skbs(head: skb, tail: skb);
527	}
528
529	/*
530	* When q->perturbation is changed, we rehash all queued skbs
531	* to avoid OOO (Out Of Order) effects.
532	* We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
533	* counters.
534	*/
535	static void sfq_rehash(struct Qdisc *sch)
536	{
537	struct sfq_sched_data *q = qdisc_priv(sch);
538	struct sk_buff *skb;
539	int i;
540	struct sfq_slot *slot;
541	struct sk_buff_head list;
542	int dropped = `0`;
543	unsigned int drop_len = `0`;
544
545	__skb_queue_head_init(list: &list);
546
547	for (i = `0`; i < q->maxflows; i++) {
548	slot = &q->slots[i];
549	if (!slot->qlen)
550	continue;
551	while (slot->qlen) {
552	skb = slot_dequeue_head(slot);
553	sfq_dec(q, x: i);
554	__skb_queue_tail(list: &list, newsk: skb);
555	}
556	slot->backlog = `0`;
557	red_set_vars(v: &slot->vars);
558	q->ht[slot->hash] = SFQ_EMPTY_SLOT;
559	}
560	q->tail = NULL;
561
562	while ((skb = __skb_dequeue(list: &list)) != NULL) {
563	unsigned int hash = sfq_hash(q, skb);
564	sfq_index x = q->ht[hash];
565
566	slot = &q->slots[x];
567	if (x == SFQ_EMPTY_SLOT) {
568	x = q->dep[`0`].next; / get a free slot /
569	if (x >= SFQ_MAX_FLOWS) {
570	drop:
571	qdisc_qstats_backlog_dec(sch, skb);
572	drop_len += qdisc_pkt_len(skb);
573	kfree_skb(skb);
574	dropped++;
575	continue;
576	}
577	q->ht[hash] = x;
578	slot = &q->slots[x];
579	slot->hash = hash;
580	}
581	if (slot->qlen >= q->maxdepth)
582	goto drop;
583	slot_queue_add(slot, skb);
584	if (q->red_parms)
585	slot->vars.qavg = red_calc_qavg(p: q->red_parms,
586	v: &slot->vars,
587	backlog: slot->backlog);
588	slot->backlog += qdisc_pkt_len(skb);
589	sfq_inc(q, x);
590	if (slot->qlen == `1`) { / The flow is new /
591	if (q->tail == NULL) { / It is the first flow /
592	slot->next = x;
593	} else {
594	slot->next = q->tail->next;
595	q->tail->next = x;
596	}
597	q->tail = slot;
598	slot->allot = q->scaled_quantum;
599	}
600	}
601	sch->q.qlen -= dropped;
602	qdisc_tree_reduce_backlog(qdisc: sch, n: dropped, len: drop_len);
603	}
604
605	static void sfq_perturbation(struct timer_list *t)
606	{
607	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
608	struct Qdisc *sch = q->sch;
609	spinlock_t *root_lock;
610	siphash_key_t nkey;
611
612	get_random_bytes(buf: &nkey, len: sizeof(nkey));
613	rcu_read_lock();
614	root_lock = qdisc_lock(qdisc: qdisc_root_sleeping(qdisc: sch));
615	spin_lock(lock: root_lock);
616	q->perturbation = nkey;
617	if (!q->filter_list && q->tail)
618	sfq_rehash(sch);
619	spin_unlock(lock: root_lock);
620
621	if (q->perturb_period)
622	mod_timer(timer: &q->perturb_timer, expires: jiffies + q->perturb_period);
623	rcu_read_unlock();
624	}
625
626	static int sfq_change(struct Qdisc sch, struct* nlattr *opt)
627	{
628	struct sfq_sched_data *q = qdisc_priv(sch);
629	struct tc_sfq_qopt *ctl = nla_data(nla: opt);
630	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
631	unsigned int qlen, dropped = `0`;
632	struct red_parms *p = NULL;
633	struct sk_buff *to_free = NULL;
634	struct sk_buff *tail = NULL;
635
636	if (opt->nla_len < nla_attr_size(payload: sizeof(*ctl)))
637	return -EINVAL;
638	if (opt->nla_len >= nla_attr_size(payload: sizeof(*ctl_v1)))
639	ctl_v1 = nla_data(nla: opt);
640	if (ctl->divisor &&
641	(!is_power_of_2(n: ctl->divisor) \|\| ctl->divisor > `65536`))
642	return -EINVAL;
643
644	/ slot->allot is a short, make sure quantum is not too big. /
645	if (ctl->quantum) {
646	unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum);
647
648	if (scaled <= `0` \|\| scaled > SHRT_MAX)
649	return -EINVAL;
650	}
651
652	if (ctl_v1 && !red_check_params(qth_min: ctl_v1->qth_min, qth_max: ctl_v1->qth_max,
653	Wlog: ctl_v1->Wlog, Scell_log: ctl_v1->Scell_log, NULL))
654	return -EINVAL;
655	if (ctl_v1 && ctl_v1->qth_min) {
656	p = kmalloc(size: sizeof(*p), GFP_KERNEL);
657	if (!p)
658	return -ENOMEM;
659	}
660	sch_tree_lock(q: sch);
661	if (ctl->quantum) {
662	q->quantum = ctl->quantum;
663	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
664	}
665	q->perturb_period = ctl->perturb_period * HZ;
666	if (ctl->flows)
667	q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
668	if (ctl->divisor) {
669	q->divisor = ctl->divisor;
670	q->maxflows = min_t(u32, q->maxflows, q->divisor);
671	}
672	if (ctl_v1) {
673	if (ctl_v1->depth)
674	q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
675	if (p) {
676	swap(q->red_parms, p);
677	red_set_parms(p: q->red_parms,
678	qth_min: ctl_v1->qth_min, qth_max: ctl_v1->qth_max,
679	Wlog: ctl_v1->Wlog,
680	Plog: ctl_v1->Plog, Scell_log: ctl_v1->Scell_log,
681	NULL,
682	max_P: ctl_v1->max_P);
683	}
684	q->flags = ctl_v1->flags;
685	q->headdrop = ctl_v1->headdrop;
686	}
687	if (ctl->limit) {
688	q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
689	q->maxflows = min_t(u32, q->maxflows, q->limit);
690	}
691
692	qlen = sch->q.qlen;
693	while (sch->q.qlen > q->limit) {
694	dropped += sfq_drop(sch, to_free: &to_free);
695	if (!tail)
696	tail = to_free;
697	}
698
699	rtnl_kfree_skbs(head: to_free, tail);
700	qdisc_tree_reduce_backlog(qdisc: sch, n: qlen - sch->q.qlen, len: dropped);
701
702	del_timer(timer: &q->perturb_timer);
703	if (q->perturb_period) {
704	mod_timer(timer: &q->perturb_timer, expires: jiffies + q->perturb_period);
705	get_random_bytes(buf: &q->perturbation, len: sizeof(q->perturbation));
706	}
707	sch_tree_unlock(q: sch);
708	kfree(objp: p);
709	return `0`;
710	}
711
712	static void *sfq_alloc(size_t sz)
713	{
714	return kvmalloc(size: sz, GFP_KERNEL);
715	}
716
717	static void sfq_free(void *addr)
718	{
719	kvfree(addr);
720	}
721
722	static void sfq_destroy(struct Qdisc *sch)
723	{
724	struct sfq_sched_data *q = qdisc_priv(sch);
725
726	tcf_block_put(block: q->block);
727	q->perturb_period = `0`;
728	del_timer_sync(timer: &q->perturb_timer);
729	sfq_free(addr: q->ht);
730	sfq_free(addr: q->slots);
731	kfree(objp: q->red_parms);
732	}
733
734	static int sfq_init(struct Qdisc sch, struct* nlattr *opt,
735	struct netlink_ext_ack *extack)
736	{
737	struct sfq_sched_data *q = qdisc_priv(sch);
738	int i;
739	int err;
740
741	q->sch = sch;
742	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
743
744	err = tcf_block_get(p_block: &q->block, p_filter_chain: &q->filter_list, q: sch, extack);
745	if (err)
746	return err;
747
748	for (i = `0`; i < SFQ_MAX_DEPTH + `1`; i++) {
749	q->dep[i].next = i + SFQ_MAX_FLOWS;
750	q->dep[i].prev = i + SFQ_MAX_FLOWS;
751	}
752
753	q->limit = SFQ_MAX_DEPTH;
754	q->maxdepth = SFQ_MAX_DEPTH;
755	q->cur_depth = `0`;
756	q->tail = NULL;
757	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
758	q->maxflows = SFQ_DEFAULT_FLOWS;
759	q->quantum = psched_mtu(dev: qdisc_dev(qdisc: sch));
760	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
761	q->perturb_period = `0`;
762	get_random_bytes(buf: &q->perturbation, len: sizeof(q->perturbation));
763
764	if (opt) {
765	int err = sfq_change(sch, opt);
766	if (err)
767	return err;
768	}
769
770	q->ht = sfq_alloc(sz: sizeof(q->ht[`0`]) * q->divisor);
771	q->slots = sfq_alloc(sz: sizeof(q->slots[`0`]) * q->maxflows);
772	if (!q->ht \|\| !q->slots) {
773	/ Note: sfq_destroy() will be called by our caller /
774	return -ENOMEM;
775	}
776
777	for (i = `0`; i < q->divisor; i++)
778	q->ht[i] = SFQ_EMPTY_SLOT;
779
780	for (i = `0`; i < q->maxflows; i++) {
781	slot_queue_init(slot: &q->slots[i]);
782	sfq_link(q, x: i);
783	}
784	if (q->limit >= `1`)
785	sch->flags \|= TCQ_F_CAN_BYPASS;
786	else
787	sch->flags &= ~TCQ_F_CAN_BYPASS;
788	return `0`;
789	}
790
791	static int sfq_dump(struct Qdisc sch, struct* sk_buff *skb)
792	{
793	struct sfq_sched_data *q = qdisc_priv(sch);
794	unsigned char *b = skb_tail_pointer(skb);
795	struct tc_sfq_qopt_v1 opt;
796	struct red_parms *p = q->red_parms;
797
798	memset(&opt, `0`, sizeof(opt));
799	opt.v0.quantum = q->quantum;
800	opt.v0.perturb_period = q->perturb_period / HZ;
801	opt.v0.limit = q->limit;
802	opt.v0.divisor = q->divisor;
803	opt.v0.flows = q->maxflows;
804	opt.depth = q->maxdepth;
805	opt.headdrop = q->headdrop;
806
807	if (p) {
808	opt.qth_min = p->qth_min >> p->Wlog;
809	opt.qth_max = p->qth_max >> p->Wlog;
810	opt.Wlog = p->Wlog;
811	opt.Plog = p->Plog;
812	opt.Scell_log = p->Scell_log;
813	opt.max_P = p->max_P;
814	}
815	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
816	opt.flags = q->flags;
817
818	if (nla_put(skb, attrtype: TCA_OPTIONS, attrlen: sizeof(opt), data: &opt))
819	goto nla_put_failure;
820
821	return skb->len;
822
823	nla_put_failure:
824	nlmsg_trim(skb, mark: b);
825	return -`1`;
826	}
827
828	static struct Qdisc sfq_leaf(struct* Qdisc sch, unsigned* long arg)
829	{
830	return NULL;
831	}
832
833	static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
834	{
835	return `0`;
836	}
837
838	static unsigned long sfq_bind(struct Qdisc sch, unsigned* long parent,
839	u32 classid)
840	{
841	return `0`;
842	}
843
844	static void sfq_unbind(struct Qdisc q, unsigned* long cl)
845	{
846	}
847
848	static struct tcf_block sfq_tcf_block(struct* Qdisc sch, unsigned* long cl,
849	struct netlink_ext_ack *extack)
850	{
851	struct sfq_sched_data *q = qdisc_priv(sch);
852
853	if (cl)
854	return NULL;
855	return q->block;
856	}
857
858	static int sfq_dump_class(struct Qdisc sch, unsigned* long cl,
859	struct sk_buff skb, struct* tcmsg *tcm)
860	{
861	tcm->tcm_handle \|= TC_H_MIN(cl);
862	return `0`;
863	}
864
865	static int sfq_dump_class_stats(struct Qdisc sch, unsigned* long cl,
866	struct gnet_dump *d)
867	{
868	struct sfq_sched_data *q = qdisc_priv(sch);
869	sfq_index idx = q->ht[cl - `1`];
870	struct gnet_stats_queue qs = { `0` };
871	struct tc_sfq_xstats xstats = { `0` };
872
873	if (idx != SFQ_EMPTY_SLOT) {
874	const struct sfq_slot *slot = &q->slots[idx];
875
876	xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
877	qs.qlen = slot->qlen;
878	qs.backlog = slot->backlog;
879	}
880	if (gnet_stats_copy_queue(d, NULL, q: &qs, qlen: qs.qlen) < `0`)
881	return -`1`;
882	return gnet_stats_copy_app(d, st: &xstats, len: sizeof(xstats));
883	}
884
885	static void sfq_walk(struct Qdisc sch, struct* qdisc_walker *arg)
886	{
887	struct sfq_sched_data *q = qdisc_priv(sch);
888	unsigned int i;
889
890	if (arg->stop)
891	return;
892
893	for (i = `0`; i < q->divisor; i++) {
894	if (q->ht[i] == SFQ_EMPTY_SLOT) {
895	arg->count++;
896	continue;
897	}
898	if (!tc_qdisc_stats_dump(sch, cl: i + `1`, arg))
899	break;
900	}
901	}
902
903	static const struct Qdisc_class_ops sfq_class_ops = {
904	.leaf = sfq_leaf,
905	.find = sfq_find,
906	.tcf_block = sfq_tcf_block,
907	.bind_tcf = sfq_bind,
908	.unbind_tcf = sfq_unbind,
909	.dump = sfq_dump_class,
910	.dump_stats = sfq_dump_class_stats,
911	.walk = sfq_walk,
912	};
913
914	static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
915	.cl_ops = &sfq_class_ops,
916	.id = "sfq",
917	.priv_size = sizeof(struct sfq_sched_data),
918	.enqueue = sfq_enqueue,
919	.dequeue = sfq_dequeue,
920	.peek = qdisc_peek_dequeued,
921	.init = sfq_init,
922	.reset = sfq_reset,
923	.destroy = sfq_destroy,
924	.change = NULL,
925	.dump = sfq_dump,
926	.owner = THIS_MODULE,
927	};
928
929	static int __init sfq_module_init(void)
930	{
931	return register_qdisc(qops: &sfq_qdisc_ops);
932	}
933	static void __exit sfq_module_exit(void)
934	{
935	unregister_qdisc(qops: &sfq_qdisc_ops);
936	}
937	module_init(sfq_module_init)
938	module_exit(sfq_module_exit)
939	MODULE_LICENSE("GPL");
940

source code of linux/net/sched/sch_sfq.c