sch_qfq.c source code [linux/net/sched/sch_qfq.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
4	*
5	* Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
6	* Copyright (c) 2012 Paolo Valente.
7	*/
8
9	#include <linux/module.h>
10	#include <linux/init.h>
11	#include <linux/bitops.h>
12	#include <linux/errno.h>
13	#include <linux/netdevice.h>
14	#include <linux/pkt_sched.h>
15	#include <net/sch_generic.h>
16	#include <net/pkt_sched.h>
17	#include <net/pkt_cls.h>
18
19
20	/*  Quick Fair Queueing Plus
21	========================
22
23	Sources:
24
25	[1] Paolo Valente,
26	"Reducing the Execution Time of Fair-Queueing Schedulers."
27	http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
28
29	Sources for QFQ:
30
31	[2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
32	Packet Scheduling with Tight Bandwidth Distribution Guarantees."
33
34	See also:
35	http://retis.sssup.it/~fabio/linux/qfq/
36	*/
37
38	/*
39
40	QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
41	classes. Each aggregate is timestamped with a virtual start time S
42	and a virtual finish time F, and scheduled according to its
43	timestamps. S and F are computed as a function of a system virtual
44	time function V. The classes within each aggregate are instead
45	scheduled with DRR.
46
47	To speed up operations, QFQ+ divides also aggregates into a limited
48	number of groups. Which group a class belongs to depends on the
49	ratio between the maximum packet length for the class and the weight
50	of the class. Groups have their own S and F. In the end, QFQ+
51	schedules groups, then aggregates within groups, then classes within
52	aggregates. See [1] and [2] for a full description.
53
54	Virtual time computations.
55
56	S, F and V are all computed in fixed point arithmetic with
57	FRAC_BITS decimal bits.
58
59	QFQ_MAX_INDEX is the maximum index allowed for a group. We need
60	one bit per index.
61	QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
62
63	The layout of the bits is as below:
64
65	[ MTU_SHIFT ][ FRAC_BITS ]
66	[ MAX_INDEX ][ MIN_SLOT_SHIFT ]
67	^.__grp->index = 0
68	*.__grp->slot_shift
69
70	where MIN_SLOT_SHIFT is derived by difference from the others.
71
72	The max group index corresponds to Lmax/w_min, where
73	Lmax=1<<MTU_SHIFT, w_min = 1 .
74	From this, and knowing how many groups (MAX_INDEX) we want,
75	we can derive the shift corresponding to each group.
76
77	Because we often need to compute
78	F = S + len/w_i and V = V + len/wsum
79	instead of storing w_i store the value
80	inv_w = (1<<FRAC_BITS)/w_i
81	so we can do F = S + len * inv_w * wsum.
82	We use W_TOT in the formulas so we can easily move between
83	static and adaptive weight sum.
84
85	The per-scheduler-instance data contain all the data structures
86	for the scheduler: bitmaps and bucket lists.
87
88	*/
89
/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group.
 */
#define QFQ_MAX_SLOTS	32
95
/*
 * Shifts used for aggregate<->group mapping.  We allow class weights that are
 * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
 * group with the smallest index that can support the L_i / r_i configured
 * for the classes in the aggregate.
 *
 * grp->index is the index of the group; and grp->slot_shift
 * is the shift for the corresponding (scaled) sigma_i.
 */
#define QFQ_MAX_INDEX		24
#define QFQ_MAX_WSHIFT		10

#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
#define QFQ_MAX_WSUM		(64*QFQ_MAX_WEIGHT)

#define FRAC_BITS		30	/* fixed point arithmetic */
#define ONE_FP			(1UL << FRAC_BITS)

#define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
#define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
#define QFQ_MAX_LMAX		(1UL << QFQ_MTU_SHIFT)

#define QFQ_MAX_AGG_CLASSES	8 /* max num classes per aggregate allowed */
119
/*
 * Possible group states.  These values are used as indexes for the bitmaps
 * array of struct qfq_queue.
 */
enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
125
126	struct qfq_group;
127
128	struct qfq_aggregate;
129
130	struct qfq_class {
131	struct Qdisc_class_common common;
132
133	struct gnet_stats_basic_sync bstats;
134	struct gnet_stats_queue qstats;
135	struct net_rate_estimator __rcu *rate_est;
136	struct Qdisc *qdisc;
137	struct list_head alist; / Link for active-classes list. /
138	struct qfq_aggregate agg; /* Parent aggregate. /
139	int deficit; / DRR deficit counter. /
140	};
141
142	struct qfq_aggregate {
143	struct hlist_node next; / Link for the slot list. /
144	u64 S, F; / flow timestamps (exact) /
145
146	/ group we belong to. In principle we would need the index,*
147	* which is log_2(lmax/weight), but we never reference it
148	* directly, only the group.
149	*/
150	struct qfq_group *grp;
151
152	/ these are copied from the flowset. /
153	u32 class_weight; / Weight of each class in this aggregate. /
154	/ Max pkt size for the classes in this aggregate, DRR quantum. /
155	int lmax;
156
157	u32 inv_w; / ONE_FP/(sum of weights of classes in aggr.). /
158	u32 budgetmax; / Max budget for this aggregate. /
159	u32 initial_budget, budget; / Initial and current budget. /
160
161	int num_classes; / Number of classes in this aggr. /
162	struct list_head active; / DRR queue of active classes. /
163
164	struct hlist_node nonfull_next; / See nonfull_aggs in qfq_sched. /
165	};
166
167	struct qfq_group {
168	u64 S, F; / group timestamps (approx). /
169	unsigned int slot_shift; / Slot shift. /
170	unsigned int index; / Group index. /
171	unsigned int front; / Index of the front slot. /
172	unsigned long full_slots; / non-empty slots /
173
174	/ Array of RR lists of active aggregates. /
175	struct hlist_head slots[QFQ_MAX_SLOTS];
176	};
177
178	struct qfq_sched {
179	struct tcf_proto __rcu *filter_list;
180	struct tcf_block *block;
181	struct Qdisc_class_hash clhash;
182
183	u64 oldV, V; / Precise virtual times. /
184	struct qfq_aggregate in_serv_agg; /* Aggregate being served. /
185	u32 wsum; / weight sum /
186	u32 iwsum; / inverse weight sum /
187
188	unsigned long bitmaps[QFQ_MAX_STATE]; / Group bitmaps. /
189	struct qfq_group groups[QFQ_MAX_INDEX + `1`]; / The groups. /
190	u32 min_slot_shift; / Index of the group-0 bit in the bitmaps. /
191
192	u32 max_agg_classes; / Max number of classes per aggr. /
193	struct hlist_head nonfull_aggs; / Aggs with room for more classes. /
194	};
195
/*
 * Possible reasons why the timestamps of an aggregate are updated
 * enqueue: the aggregate switches from idle to active and must be scheduled
 *	    for service
 * requeue: the aggregate finishes its budget, so it stops being served and
 *	    must be rescheduled for service
 */
enum update_reason {enqueue, requeue};
204
205	static struct qfq_class qfq_find_class(struct* Qdisc *sch, u32 classid)
206	{
207	struct qfq_sched *q = qdisc_priv(sch);
208	struct Qdisc_class_common *clc;
209
210	clc = qdisc_class_find(hash: &q->clhash, id: classid);
211	if (clc == NULL)
212	return NULL;
213	return container_of(clc, struct qfq_class, common);
214	}
215
216	static const struct netlink_range_validation lmax_range = {
217	.min = QFQ_MIN_LMAX,
218	.max = QFQ_MAX_LMAX,
219	};
220
221	static const struct nla_policy qfq_policy[TCA_QFQ_MAX + `1`] = {
222	[TCA_QFQ_WEIGHT] = NLA_POLICY_RANGE(NLA_U32, `1`, QFQ_MAX_WEIGHT),
223	[TCA_QFQ_LMAX] = NLA_POLICY_FULL_RANGE(NLA_U32, &lmax_range),
224	};
225
226	/*
227	* Calculate a flow index, given its weight and maximum packet length.
228	* index = log_2(maxlen/weight) but we need to apply the scaling.
229	* This is used only once at flow creation.
230	*/
231	static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
232	{
233	u64 slot_size = (u64)maxlen * inv_w;
234	unsigned long size_map;
235	int index = `0`;
236
237	size_map = slot_size >> min_slot_shift;
238	if (!size_map)
239	goto out;
240
241	index = __fls(word: size_map) + `1`; / basically a log_2 /
242	index -= !(slot_size - (`1ULL` << (index + min_slot_shift - `1`)));
243
244	if (index < `0`)
245	index = `0`;
246	out:
247	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
248	(unsigned long) ONE_FP/inv_w, maxlen, index);
249
250	return index;
251	}
252
/* Forward declarations: used before their definitions later in the file. */
static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
			     enum update_reason reason);
256
257	static void qfq_init_agg(struct qfq_sched q, struct* qfq_aggregate *agg,
258	u32 lmax, u32 weight)
259	{
260	INIT_LIST_HEAD(list: &agg->active);
261	hlist_add_head(n: &agg->nonfull_next, h: &q->nonfull_aggs);
262
263	agg->lmax = lmax;
264	agg->class_weight = weight;
265	}
266
267	static struct qfq_aggregate qfq_find_agg(struct* qfq_sched *q,
268	u32 lmax, u32 weight)
269	{
270	struct qfq_aggregate *agg;
271
272	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
273	if (agg->lmax == lmax && agg->class_weight == weight)
274	return agg;
275
276	return NULL;
277	}
278
279
280	/ Update aggregate as a function of the new number of classes. /
281	static void qfq_update_agg(struct qfq_sched q, struct* qfq_aggregate *agg,
282	int new_num_classes)
283	{
284	u32 new_agg_weight;
285
286	if (new_num_classes == q->max_agg_classes)
287	hlist_del_init(n: &agg->nonfull_next);
288
289	if (agg->num_classes > new_num_classes &&
290	new_num_classes == q->max_agg_classes - `1`) / agg no more full /
291	hlist_add_head(n: &agg->nonfull_next, h: &q->nonfull_aggs);
292
293	/ The next assignment may let*
294	* agg->initial_budget > agg->budgetmax
295	* hold, we will take it into account in charge_actual_service().
296	*/
297	agg->budgetmax = new_num_classes * agg->lmax;
298	new_agg_weight = agg->class_weight * new_num_classes;
299	agg->inv_w = ONE_FP/new_agg_weight;
300
301	if (agg->grp == NULL) {
302	int i = qfq_calc_index(inv_w: agg->inv_w, maxlen: agg->budgetmax,
303	min_slot_shift: q->min_slot_shift);
304	agg->grp = &q->groups[i];
305	}
306
307	q->wsum +=
308	(int) agg->class_weight * (new_num_classes - agg->num_classes);
309	q->iwsum = ONE_FP / q->wsum;
310
311	agg->num_classes = new_num_classes;
312	}
313
314	/ Add class to aggregate. /
315	static void qfq_add_to_agg(struct qfq_sched *q,
316	struct qfq_aggregate *agg,
317	struct qfq_class *cl)
318	{
319	cl->agg = agg;
320
321	qfq_update_agg(q, agg, new_num_classes: agg->num_classes+`1`);
322	if (cl->qdisc->q.qlen > `0`) { / adding an active class /
323	list_add_tail(new: &cl->alist, head: &agg->active);
324	if (list_first_entry(&agg->active, struct qfq_class, alist) ==
325	cl && q->in_serv_agg != agg) / agg was inactive /
326	qfq_activate_agg(q, agg, enqueue); / schedule agg /
327	}
328	}
329
/* Forward declaration: used by qfq_destroy_agg() before its definition. */
static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q);
331
332	static void qfq_destroy_agg(struct qfq_sched q, struct* qfq_aggregate *agg)
333	{
334	hlist_del_init(n: &agg->nonfull_next);
335	q->wsum -= agg->class_weight;
336	if (q->wsum != `0`)
337	q->iwsum = ONE_FP / q->wsum;
338
339	if (q->in_serv_agg == agg)
340	q->in_serv_agg = qfq_choose_next_agg(q);
341	kfree(objp: agg);
342	}
343
344	/ Deschedule class from within its parent aggregate. /
345	static void qfq_deactivate_class(struct qfq_sched q, struct* qfq_class *cl)
346	{
347	struct qfq_aggregate *agg = cl->agg;
348
349
350	list_del(entry: &cl->alist); / remove from RR queue of the aggregate /
351	if (list_empty(head: &agg->active)) / agg is now inactive /
352	qfq_deactivate_agg(q, agg);
353	}
354
355	/ Remove class from its parent aggregate. /
356	static void qfq_rm_from_agg(struct qfq_sched q, struct* qfq_class *cl)
357	{
358	struct qfq_aggregate *agg = cl->agg;
359
360	cl->agg = NULL;
361	if (agg->num_classes == `1`) { / agg being emptied, destroy it /
362	qfq_destroy_agg(q, agg);
363	return;
364	}
365	qfq_update_agg(q, agg, new_num_classes: agg->num_classes-`1`);
366	}
367
368	/ Deschedule class and remove it from its parent aggregate. /
369	static void qfq_deact_rm_from_agg(struct qfq_sched q, struct* qfq_class *cl)
370	{
371	if (cl->qdisc->q.qlen > `0`) / class is active /
372	qfq_deactivate_class(q, cl);
373
374	qfq_rm_from_agg(q, cl);
375	}
376
377	/ Move class to a new aggregate, matching the new class weight and/or lmax /
378	static int qfq_change_agg(struct Qdisc sch, struct* qfq_class *cl, u32 weight,
379	u32 lmax)
380	{
381	struct qfq_sched *q = qdisc_priv(sch);
382	struct qfq_aggregate *new_agg;
383
384	/ 'lmax' can range from [QFQ_MIN_LMAX, pktlen + stab overhead] /
385	if (lmax > QFQ_MAX_LMAX)
386	return -EINVAL;
387
388	new_agg = qfq_find_agg(q, lmax, weight);
389	if (new_agg == NULL) { / create new aggregate /
390	new_agg = kzalloc(size: sizeof(*new_agg), GFP_ATOMIC);
391	if (new_agg == NULL)
392	return -ENOBUFS;
393	qfq_init_agg(q, agg: new_agg, lmax, weight);
394	}
395	qfq_deact_rm_from_agg(q, cl);
396	qfq_add_to_agg(q, agg: new_agg, cl);
397
398	return `0`;
399	}
400
401	static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
402	struct nlattr *tca, unsigned* long *arg,
403	struct netlink_ext_ack *extack)
404	{
405	struct qfq_sched *q = qdisc_priv(sch);
406	struct qfq_class cl = (struct* qfq_class )arg;
407	bool existing = false;
408	struct nlattr *tb[TCA_QFQ_MAX + `1`];
409	struct qfq_aggregate *new_agg = NULL;
410	u32 weight, lmax, inv_w;
411	int err;
412	int delta_w;
413
414	if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) {
415	NL_SET_ERR_MSG_MOD(extack, "missing options");
416	return -EINVAL;
417	}
418
419	err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, nla: tca[TCA_OPTIONS],
420	policy: qfq_policy, extack);
421	if (err < `0`)
422	return err;
423
424	if (tb[TCA_QFQ_WEIGHT])
425	weight = nla_get_u32(nla: tb[TCA_QFQ_WEIGHT]);
426	else
427	weight = `1`;
428
429	if (tb[TCA_QFQ_LMAX]) {
430	lmax = nla_get_u32(nla: tb[TCA_QFQ_LMAX]);
431	} else {
432	/ MTU size is user controlled /
433	lmax = psched_mtu(dev: qdisc_dev(qdisc: sch));
434	if (lmax < QFQ_MIN_LMAX \|\| lmax > QFQ_MAX_LMAX) {
435	NL_SET_ERR_MSG_MOD(extack,
436	"MTU size out of bounds for qfq");
437	return -EINVAL;
438	}
439	}
440
441	inv_w = ONE_FP / weight;
442	weight = ONE_FP / inv_w;
443
444	if (cl != NULL &&
445	lmax == cl->agg->lmax &&
446	weight == cl->agg->class_weight)
447	return `0`; / nothing to change /
448
449	delta_w = weight - (cl ? cl->agg->class_weight : `0`);
450
451	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
452	NL_SET_ERR_MSG_FMT_MOD(extack,
453	"total weight out of range (%d + %u)\n",
454	delta_w, q->wsum);
455	return -EINVAL;
456	}
457
458	if (cl != NULL) { / modify existing class /
459	if (tca[TCA_RATE]) {
460	err = gen_replace_estimator(bstats: &cl->bstats, NULL,
461	ptr: &cl->rate_est,
462	NULL,
463	running: true,
464	opt: tca[TCA_RATE]);
465	if (err)
466	return err;
467	}
468	existing = true;
469	goto set_change_agg;
470	}
471
472	/ create and init new class /
473	cl = kzalloc(size: sizeof(struct qfq_class), GFP_KERNEL);
474	if (cl == NULL)
475	return -ENOBUFS;
476
477	gnet_stats_basic_sync_init(b: &cl->bstats);
478	cl->common.classid = classid;
479	cl->deficit = lmax;
480
481	cl->qdisc = qdisc_create_dflt(dev_queue: sch->dev_queue, ops: &pfifo_qdisc_ops,
482	parentid: classid, NULL);
483	if (cl->qdisc == NULL)
484	cl->qdisc = &noop_qdisc;
485
486	if (tca[TCA_RATE]) {
487	err = gen_new_estimator(bstats: &cl->bstats, NULL,
488	rate_est: &cl->rate_est,
489	NULL,
490	running: true,
491	opt: tca[TCA_RATE]);
492	if (err)
493	goto destroy_class;
494	}
495
496	if (cl->qdisc != &noop_qdisc)
497	qdisc_hash_add(q: cl->qdisc, invisible: true);
498
499	set_change_agg:
500	sch_tree_lock(q: sch);
501	new_agg = qfq_find_agg(q, lmax, weight);
502	if (new_agg == NULL) { / create new aggregate /
503	sch_tree_unlock(q: sch);
504	new_agg = kzalloc(size: sizeof(*new_agg), GFP_KERNEL);
505	if (new_agg == NULL) {
506	err = -ENOBUFS;
507	gen_kill_estimator(ptr: &cl->rate_est);
508	goto destroy_class;
509	}
510	sch_tree_lock(q: sch);
511	qfq_init_agg(q, agg: new_agg, lmax, weight);
512	}
513	if (existing)
514	qfq_deact_rm_from_agg(q, cl);
515	else
516	qdisc_class_hash_insert(&q->clhash, &cl->common);
517	qfq_add_to_agg(q, agg: new_agg, cl);
518	sch_tree_unlock(q: sch);
519	qdisc_class_hash_grow(sch, &q->clhash);
520
521	arg = (unsigned* long)cl;
522	return `0`;
523
524	destroy_class:
525	qdisc_put(qdisc: cl->qdisc);
526	kfree(objp: cl);
527	return err;
528	}
529
530	static void qfq_destroy_class(struct Qdisc sch, struct* qfq_class *cl)
531	{
532	struct qfq_sched *q = qdisc_priv(sch);
533
534	qfq_rm_from_agg(q, cl);
535	gen_kill_estimator(ptr: &cl->rate_est);
536	qdisc_put(qdisc: cl->qdisc);
537	kfree(objp: cl);
538	}
539
540	static int qfq_delete_class(struct Qdisc sch, unsigned* long arg,
541	struct netlink_ext_ack *extack)
542	{
543	struct qfq_sched *q = qdisc_priv(sch);
544	struct qfq_class cl = (struct* qfq_class *)arg;
545
546	if (qdisc_class_in_use(cl: &cl->common)) {
547	NL_SET_ERR_MSG_MOD(extack, "QFQ class in use");
548	return -EBUSY;
549	}
550
551	sch_tree_lock(q: sch);
552
553	qdisc_purge_queue(sch: cl->qdisc);
554	qdisc_class_hash_remove(&q->clhash, &cl->common);
555
556	sch_tree_unlock(q: sch);
557
558	qfq_destroy_class(sch, cl);
559	return `0`;
560	}
561
562	static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
563	{
564	return (unsigned long)qfq_find_class(sch, classid);
565	}
566
567	static struct tcf_block qfq_tcf_block(struct* Qdisc sch, unsigned* long cl,
568	struct netlink_ext_ack *extack)
569	{
570	struct qfq_sched *q = qdisc_priv(sch);
571
572	if (cl)
573	return NULL;
574
575	return q->block;
576	}
577
578	static unsigned long qfq_bind_tcf(struct Qdisc sch, unsigned* long parent,
579	u32 classid)
580	{
581	struct qfq_class *cl = qfq_find_class(sch, classid);
582
583	if (cl)
584	qdisc_class_get(cl: &cl->common);
585
586	return (unsigned long)cl;
587	}
588
589	static void qfq_unbind_tcf(struct Qdisc sch, unsigned* long arg)
590	{
591	struct qfq_class cl = (struct* qfq_class *)arg;
592
593	qdisc_class_put(cl: &cl->common);
594	}
595
596	static int qfq_graft_class(struct Qdisc sch, unsigned* long arg,
597	struct Qdisc new, struct* Qdisc **old,
598	struct netlink_ext_ack *extack)
599	{
600	struct qfq_class cl = (struct* qfq_class *)arg;
601
602	if (new == NULL) {
603	new = qdisc_create_dflt(dev_queue: sch->dev_queue, ops: &pfifo_qdisc_ops,
604	parentid: cl->common.classid, NULL);
605	if (new == NULL)
606	new = &noop_qdisc;
607	}
608
609	*old = qdisc_replace(sch, new, pold: &cl->qdisc);
610	return `0`;
611	}
612
613	static struct Qdisc qfq_class_leaf(struct* Qdisc sch, unsigned* long arg)
614	{
615	struct qfq_class cl = (struct* qfq_class *)arg;
616
617	return cl->qdisc;
618	}
619
620	static int qfq_dump_class(struct Qdisc sch, unsigned* long arg,
621	struct sk_buff skb, struct* tcmsg *tcm)
622	{
623	struct qfq_class cl = (struct* qfq_class *)arg;
624	struct nlattr *nest;
625
626	tcm->tcm_parent = TC_H_ROOT;
627	tcm->tcm_handle = cl->common.classid;
628	tcm->tcm_info = cl->qdisc->handle;
629
630	nest = nla_nest_start_noflag(skb, attrtype: TCA_OPTIONS);
631	if (nest == NULL)
632	goto nla_put_failure;
633	if (nla_put_u32(skb, attrtype: TCA_QFQ_WEIGHT, value: cl->agg->class_weight) \|\|
634	nla_put_u32(skb, attrtype: TCA_QFQ_LMAX, value: cl->agg->lmax))
635	goto nla_put_failure;
636	return nla_nest_end(skb, start: nest);
637
638	nla_put_failure:
639	nla_nest_cancel(skb, start: nest);
640	return -EMSGSIZE;
641	}
642
643	static int qfq_dump_class_stats(struct Qdisc sch, unsigned* long arg,
644	struct gnet_dump *d)
645	{
646	struct qfq_class cl = (struct* qfq_class *)arg;
647	struct tc_qfq_stats xstats;
648
649	memset(&xstats, `0`, sizeof(xstats));
650
651	xstats.weight = cl->agg->class_weight;
652	xstats.lmax = cl->agg->lmax;
653
654	if (gnet_stats_copy_basic(d, NULL, b: &cl->bstats, running: true) < `0` \|\|
655	gnet_stats_copy_rate_est(d, ptr: &cl->rate_est) < `0` \|\|
656	qdisc_qstats_copy(d, sch: cl->qdisc) < `0`)
657	return -`1`;
658
659	return gnet_stats_copy_app(d, st: &xstats, len: sizeof(xstats));
660	}
661
662	static void qfq_walk(struct Qdisc sch, struct* qdisc_walker *arg)
663	{
664	struct qfq_sched *q = qdisc_priv(sch);
665	struct qfq_class *cl;
666	unsigned int i;
667
668	if (arg->stop)
669	return;
670
671	for (i = `0`; i < q->clhash.hashsize; i++) {
672	hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
673	if (!tc_qdisc_stats_dump(sch, cl: (unsigned long)cl, arg))
674	return;
675	}
676	}
677	}
678
679	static struct qfq_class qfq_classify(struct* sk_buff skb, struct* Qdisc *sch,
680	int *qerr)
681	{
682	struct qfq_sched *q = qdisc_priv(sch);
683	struct qfq_class *cl;
684	struct tcf_result res;
685	struct tcf_proto *fl;
686	int result;
687
688	if (TC_H_MAJ(skb->priority ^ sch->handle) == `0`) {
689	pr_debug("qfq_classify: found %d\n", skb->priority);
690	cl = qfq_find_class(sch, classid: skb->priority);
691	if (cl != NULL)
692	return cl;
693	}
694
695	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
696	fl = rcu_dereference_bh(q->filter_list);
697	result = tcf_classify(skb, NULL, tp: fl, res: &res, compat_mode: false);
698	if (result >= `0`) {
699	#ifdef CONFIG_NET_CLS_ACT
700	switch (result) {
701	case TC_ACT_QUEUED:
702	case TC_ACT_STOLEN:
703	case TC_ACT_TRAP:
704	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_STOLEN;
705	fallthrough;
706	case TC_ACT_SHOT:
707	return NULL;
708	}
709	#endif
710	cl = (struct qfq_class *)res.class;
711	if (cl == NULL)
712	cl = qfq_find_class(sch, classid: res.classid);
713	return cl;
714	}
715
716	return NULL;
717	}
718
719	/ Generic comparison function, handling wraparound. /
720	static inline int qfq_gt(u64 a, u64 b)
721	{
722	return (s64)(a - b) > `0`;
723	}
724
725	/ Round a precise timestamp to its slotted value. /
726	static inline u64 qfq_round_down(u64 ts, unsigned int shift)
727	{
728	return ts & ~((`1ULL` << shift) - `1`);
729	}
730
731	/ return the pointer to the group with lowest index in the bitmap /
732	static inline struct qfq_group qfq_ffs(struct* qfq_sched *q,
733	unsigned long bitmap)
734	{
735	int index = __ffs(bitmap);
736	return &q->groups[index];
737	}
/*
 * Calculate a mask to mimic what would be ffs_from(): keep only the bits
 * of @bitmap at position @from and above.
 */
static inline unsigned long mask_from(unsigned long bitmap, int from)
{
	return bitmap & ~((1UL << from) - 1);
}
743
744	/*
745	* The state computation relies on ER=0, IR=1, EB=2, IB=3
746	* First compute eligibility comparing grp->S, q->V,
747	* then check if someone is blocking us and possibly add EB
748	*/
749	static int qfq_calc_state(struct qfq_sched q, const* struct qfq_group *grp)
750	{
751	/ if S > V we are not eligible /
752	unsigned int state = qfq_gt(a: grp->S, b: q->V);
753	unsigned long mask = mask_from(bitmap: q->bitmaps[ER], from: grp->index);
754	struct qfq_group *next;
755
756	if (mask) {
757	next = qfq_ffs(q, bitmap: mask);
758	if (qfq_gt(a: grp->F, b: next->F))
759	state \|= EB;
760	}
761
762	return state;
763	}
764
765
766	/*
767	* In principle
768	* q->bitmaps[dst] \|= q->bitmaps[src] & mask;
769	* q->bitmaps[src] &= ~mask;
770	* but we should make sure that src != dst
771	*/
772	static inline void qfq_move_groups(struct qfq_sched q, unsigned* long mask,
773	int src, int dst)
774	{
775	q->bitmaps[dst] \|= q->bitmaps[src] & mask;
776	q->bitmaps[src] &= ~mask;
777	}
778
779	static void qfq_unblock_groups(struct qfq_sched q, int* index, u64 old_F)
780	{
781	unsigned long mask = mask_from(bitmap: q->bitmaps[ER], from: index + `1`);
782	struct qfq_group *next;
783
784	if (mask) {
785	next = qfq_ffs(q, bitmap: mask);
786	if (!qfq_gt(a: next->F, b: old_F))
787	return;
788	}
789
790	mask = (`1UL` << index) - `1`;
791	qfq_move_groups(q, mask, src: EB, dst: ER);
792	qfq_move_groups(q, mask, src: IB, dst: IR);
793	}
794
795	/*
796	* perhaps
797	*
798	old_V ^= q->V;
799	old_V >>= q->min_slot_shift;
800	if (old_V) {
801	...
802	}
803	*
804	*/
805	static void qfq_make_eligible(struct qfq_sched *q)
806	{
807	unsigned long vslot = q->V >> q->min_slot_shift;
808	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
809
810	if (vslot != old_vslot) {
811	unsigned long mask;
812	int last_flip_pos = fls(x: vslot ^ old_vslot);
813
814	if (last_flip_pos > `31`) / higher than the number of groups /
815	mask = ~`0UL`; / make all groups eligible /
816	else
817	mask = (`1UL` << last_flip_pos) - `1`;
818
819	qfq_move_groups(q, mask, src: IR, dst: ER);
820	qfq_move_groups(q, mask, src: IB, dst: EB);
821	}
822	}
823
824	/*
825	* The index of the slot in which the input aggregate agg is to be
826	* inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
827	* and not a '-1' because the start time of the group may be moved
828	* backward by one slot after the aggregate has been inserted, and
829	* this would cause non-empty slots to be right-shifted by one
830	* position.
831	*
832	* QFQ+ fully satisfies this bound to the slot index if the parameters
833	* of the classes are not changed dynamically, and if QFQ+ never
834	* happens to postpone the service of agg unjustly, i.e., it never
835	* happens that the aggregate becomes backlogged and eligible, or just
836	* eligible, while an aggregate with a higher approximated finish time
837	* is being served. In particular, in this case QFQ+ guarantees that
838	* the timestamps of agg are low enough that the slot index is never
839	* higher than 2. Unfortunately, QFQ+ cannot provide the same
840	* guarantee if it happens to unjustly postpone the service of agg, or
841	* if the parameters of some class are changed.
842	*
843	* As for the first event, i.e., an out-of-order service, the
844	* upper bound to the slot index guaranteed by QFQ+ grows to
845	* 2 +
846	* QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
847	* (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
848	*
849	* The following function deals with this problem by backward-shifting
850	* the timestamps of agg, if needed, so as to guarantee that the slot
851	* index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
852	* cause the service of other aggregates to be postponed, yet the
853	* worst-case guarantees of these aggregates are not violated. In
854	* fact, in case of no out-of-order service, the timestamps of agg
855	* would have been even lower than they are after the backward shift,
856	* because QFQ+ would have guaranteed a maximum value equal to 2 for
857	* the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
858	* service is postponed because of the backward-shift would have
859	* however waited for the service of agg before being served.
860	*
861	* The other event that may cause the slot index to be higher than 2
862	* for agg is a recent change of the parameters of some class. If the
863	* weight of a class is increased or the lmax (max_pkt_size) of the
864	* class is decreased, then a new aggregate with smaller slot size
865	* than the original parent aggregate of the class may happen to be
866	* activated. The activation of this aggregate should be properly
867	* delayed to when the service of the class has finished in the ideal
868	* system tracked by QFQ+. If the activation of the aggregate is not
869	* delayed to this reference time instant, then this aggregate may be
870	* unjustly served before other aggregates waiting for service. This
871	* may cause the above bound to the slot index to be violated for some
872	* of these unlucky aggregates.
873	*
874	* Instead of delaying the activation of the new aggregate, which is
875	* quite complex, the above-discussed capping of the slot index is
876	* used to handle also the consequences of a change of the parameters
877	* of a class.
878	*/
879	static void qfq_slot_insert(struct qfq_group grp, struct* qfq_aggregate *agg,
880	u64 roundedS)
881	{
882	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
883	unsigned int i; / slot index in the bucket list /
884
885	if (unlikely(slot > QFQ_MAX_SLOTS - `2`)) {
886	u64 deltaS = roundedS - grp->S -
887	((u64)(QFQ_MAX_SLOTS - `2`)<<grp->slot_shift);
888	agg->S -= deltaS;
889	agg->F -= deltaS;
890	slot = QFQ_MAX_SLOTS - `2`;
891	}
892
893	i = (grp->front + slot) % QFQ_MAX_SLOTS;
894
895	hlist_add_head(n: &agg->next, h: &grp->slots[i]);
896	__set_bit(slot, &grp->full_slots);
897	}
898
899	/ Maybe introduce hlist_first_entry?? /
900	static struct qfq_aggregate qfq_slot_head(struct* qfq_group *grp)
901	{
902	return hlist_entry(grp->slots[grp->front].first,
903	struct qfq_aggregate, next);
904	}
905
906	/*
907	* remove the entry from the slot
908	*/
909	static void qfq_front_slot_remove(struct qfq_group *grp)
910	{
911	struct qfq_aggregate *agg = qfq_slot_head(grp);
912
913	BUG_ON(!agg);
914	hlist_del(n: &agg->next);
915	if (hlist_empty(h: &grp->slots[grp->front]))
916	__clear_bit(`0`, &grp->full_slots);
917	}
918
919	/*
920	* Returns the first aggregate in the first non-empty bucket of the
921	* group. As a side effect, adjusts the bucket list so the first
922	* non-empty bucket is at position 0 in full_slots.
923	*/
924	static struct qfq_aggregate qfq_slot_scan(struct* qfq_group *grp)
925	{
926	unsigned int i;
927
928	pr_debug("qfq slot_scan: grp %u full %#lx\n",
929	grp->index, grp->full_slots);
930
931	if (grp->full_slots == `0`)
932	return NULL;
933
934	i = __ffs(grp->full_slots); / zero based /
935	if (i > `0`) {
936	grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
937	grp->full_slots >>= i;
938	}
939
940	return qfq_slot_head(grp);
941	}
942
943	/*
944	* adjust the bucket list. When the start time of a group decreases,
945	* we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
946	* move the objects. The mask of occupied slots must be shifted
947	* because we use ffs() to find the first non-empty slot.
948	* This covers decreases in the group's start time, but what about
949	* increases of the start time ?
950	* Here too we should make sure that i is less than 32
951	*/
952	static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
953	{
954	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
955
956	grp->full_slots <<= i;
957	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
958	}
959
960	static void qfq_update_eligible(struct qfq_sched *q)
961	{
962	struct qfq_group *grp;
963	unsigned long ineligible;
964
965	ineligible = q->bitmaps[IR] \| q->bitmaps[IB];
966	if (ineligible) {
967	if (!q->bitmaps[ER]) {
968	grp = qfq_ffs(q, bitmap: ineligible);
969	if (qfq_gt(a: grp->S, b: q->V))
970	q->V = grp->S;
971	}
972	qfq_make_eligible(q);
973	}
974	}
975
976	/ Dequeue head packet of the head class in the DRR queue of the aggregate. /
977	static struct sk_buff agg_dequeue(struct* qfq_aggregate *agg,
978	struct qfq_class cl, unsigned* int len)
979	{
980	struct sk_buff *skb = qdisc_dequeue_peeked(sch: cl->qdisc);
981
982	if (!skb)
983	return NULL;
984
985	cl->deficit -= (int) len;
986
987	if (cl->qdisc->q.qlen == `0`) / no more packets, remove from list /
988	list_del(entry: &cl->alist);
989	else if (cl->deficit < qdisc_pkt_len(skb: cl->qdisc->ops->peek(cl->qdisc))) {
990	cl->deficit += agg->lmax;
991	list_move_tail(list: &cl->alist, head: &agg->active);
992	}
993
994	return skb;
995	}
996
997	static inline struct sk_buff qfq_peek_skb(struct* qfq_aggregate *agg,
998	struct qfq_class **cl,
999	unsigned int *len)
1000	{
1001	struct sk_buff *skb;
1002
1003	cl = list_first_entry(&agg->active, struct* qfq_class, alist);
1004	skb = (cl)->qdisc->ops->peek((cl)->qdisc);
1005	if (skb == NULL)
1006	qdisc_warn_nonwc(txt: "qfq_dequeue", qdisc: (*cl)->qdisc);
1007	else
1008	*len = qdisc_pkt_len(skb);
1009
1010	return skb;
1011	}
1012
1013	/ Update F according to the actual service received by the aggregate. /
1014	static inline void charge_actual_service(struct qfq_aggregate *agg)
1015	{
1016	/ Compute the service received by the aggregate, taking into*
1017	* account that, after decreasing the number of classes in
1018	* agg, it may happen that
1019	* agg->initial_budget - agg->budget > agg->bugdetmax
1020	*/
1021	u32 service_received = min(agg->budgetmax,
1022	agg->initial_budget - agg->budget);
1023
1024	agg->F = agg->S + (u64)service_received * agg->inv_w;
1025	}
1026
1027	/ Assign a reasonable start time for a new aggregate in group i.*
1028	* Admissible values for \hat(F) are multiples of \sigma_i
1029	* no greater than V+\sigma_i . Larger values mean that
1030	* we had a wraparound so we consider the timestamp to be stale.
1031	*
1032	* If F is not stale and F >= V then we set S = F.
1033	* Otherwise we should assign S = V, but this may violate
1034	* the ordering in EB (see [2]). So, if we have groups in ER,
1035	* set S to the F_j of the first group j which would be blocking us.
1036	* We are guaranteed not to move S backward because
1037	* otherwise our group i would still be blocked.
1038	*/
1039	static void qfq_update_start(struct qfq_sched q, struct* qfq_aggregate *agg)
1040	{
1041	unsigned long mask;
1042	u64 limit, roundedF;
1043	int slot_shift = agg->grp->slot_shift;
1044
1045	roundedF = qfq_round_down(ts: agg->F, shift: slot_shift);
1046	limit = qfq_round_down(ts: q->V, shift: slot_shift) + (`1ULL` << slot_shift);
1047
1048	if (!qfq_gt(a: agg->F, b: q->V) \|\| qfq_gt(a: roundedF, b: limit)) {
1049	/ timestamp was stale /
1050	mask = mask_from(bitmap: q->bitmaps[ER], from: agg->grp->index);
1051	if (mask) {
1052	struct qfq_group *next = qfq_ffs(q, bitmap: mask);
1053	if (qfq_gt(a: roundedF, b: next->F)) {
1054	if (qfq_gt(a: limit, b: next->F))
1055	agg->S = next->F;
1056	else / preserve timestamp correctness /
1057	agg->S = limit;
1058	return;
1059	}
1060	}
1061	agg->S = q->V;
1062	} else / timestamp is not stale /
1063	agg->S = agg->F;
1064	}
1065
1066	/ Update the timestamps of agg before scheduling/rescheduling it for*
1067	* service. In particular, assign to agg->F its maximum possible
1068	* value, i.e., the virtual finish time with which the aggregate
1069	* should be labeled if it used all its budget once in service.
1070	*/
1071	static inline void
1072	qfq_update_agg_ts(struct qfq_sched *q,
1073	struct qfq_aggregate agg, enum* update_reason reason)
1074	{
1075	if (reason != requeue)
1076	qfq_update_start(q, agg);
1077	else / just charge agg for the service received /
1078	agg->S = agg->F;
1079
1080	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
1081	}
1082
/* Defined further down; declared here because qfq_dequeue() calls it. */
static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
1084
1085	static struct sk_buff qfq_dequeue(struct* Qdisc *sch)
1086	{
1087	struct qfq_sched *q = qdisc_priv(sch);
1088	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
1089	struct qfq_class *cl;
1090	struct sk_buff *skb = NULL;
1091	/ next-packet len, 0 means no more active classes in in-service agg /
1092	unsigned int len = `0`;
1093
1094	if (in_serv_agg == NULL)
1095	return NULL;
1096
1097	if (!list_empty(head: &in_serv_agg->active))
1098	skb = qfq_peek_skb(agg: in_serv_agg, cl: &cl, len: &len);
1099
1100	/*
1101	* If there are no active classes in the in-service aggregate,
1102	* or if the aggregate has not enough budget to serve its next
1103	* class, then choose the next aggregate to serve.
1104	*/
1105	if (len == `0` \|\| in_serv_agg->budget < len) {
1106	charge_actual_service(agg: in_serv_agg);
1107
1108	/ recharge the budget of the aggregate /
1109	in_serv_agg->initial_budget = in_serv_agg->budget =
1110	in_serv_agg->budgetmax;
1111
1112	if (!list_empty(head: &in_serv_agg->active)) {
1113	/*
1114	* Still active: reschedule for
1115	* service. Possible optimization: if no other
1116	* aggregate is active, then there is no point
1117	* in rescheduling this aggregate, and we can
1118	* just keep it as the in-service one. This
1119	* should be however a corner case, and to
1120	* handle it, we would need to maintain an
1121	* extra num_active_aggs field.
1122	*/
1123	qfq_update_agg_ts(q, agg: in_serv_agg, reason: requeue);
1124	qfq_schedule_agg(q, agg: in_serv_agg);
1125	} else if (sch->q.qlen == `0`) { / no aggregate to serve /
1126	q->in_serv_agg = NULL;
1127	return NULL;
1128	}
1129
1130	/*
1131	* If we get here, there are other aggregates queued:
1132	* choose the new aggregate to serve.
1133	*/
1134	in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
1135	skb = qfq_peek_skb(agg: in_serv_agg, cl: &cl, len: &len);
1136	}
1137	if (!skb)
1138	return NULL;
1139
1140	sch->q.qlen--;
1141
1142	skb = agg_dequeue(agg: in_serv_agg, cl, len);
1143
1144	if (!skb) {
1145	sch->q.qlen++;
1146	return NULL;
1147	}
1148
1149	qdisc_qstats_backlog_dec(sch, skb);
1150	qdisc_bstats_update(sch, skb);
1151
1152	/ If lmax is lowered, through qfq_change_class, for a class*
1153	* owning pending packets with larger size than the new value
1154	* of lmax, then the following condition may hold.
1155	*/
1156	if (unlikely(in_serv_agg->budget < len))
1157	in_serv_agg->budget = `0`;
1158	else
1159	in_serv_agg->budget -= len;
1160
1161	q->V += (u64)len * q->iwsum;
1162	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
1163	len, (unsigned long long) in_serv_agg->F,
1164	(unsigned long long) q->V);
1165
1166	return skb;
1167	}
1168
1169	static struct qfq_aggregate qfq_choose_next_agg(struct* qfq_sched *q)
1170	{
1171	struct qfq_group *grp;
1172	struct qfq_aggregate agg, new_front_agg;
1173	u64 old_F;
1174
1175	qfq_update_eligible(q);
1176	q->oldV = q->V;
1177
1178	if (!q->bitmaps[ER])
1179	return NULL;
1180
1181	grp = qfq_ffs(q, bitmap: q->bitmaps[ER]);
1182	old_F = grp->F;
1183
1184	agg = qfq_slot_head(grp);
1185
1186	/ agg starts to be served, remove it from schedule /
1187	qfq_front_slot_remove(grp);
1188
1189	new_front_agg = qfq_slot_scan(grp);
1190
1191	if (new_front_agg == NULL) / group is now inactive, remove from ER /
1192	__clear_bit(grp->index, &q->bitmaps[ER]);
1193	else {
1194	u64 roundedS = qfq_round_down(ts: new_front_agg->S,
1195	shift: grp->slot_shift);
1196	unsigned int s;
1197
1198	if (grp->S == roundedS)
1199	return agg;
1200	grp->S = roundedS;
1201	grp->F = roundedS + (`2ULL` << grp->slot_shift);
1202	__clear_bit(grp->index, &q->bitmaps[ER]);
1203	s = qfq_calc_state(q, grp);
1204	__set_bit(grp->index, &q->bitmaps[s]);
1205	}
1206
1207	qfq_unblock_groups(q, index: grp->index, old_F);
1208
1209	return agg;
1210	}
1211
1212	static int qfq_enqueue(struct sk_buff skb, struct* Qdisc *sch,
1213	struct sk_buff **to_free)
1214	{
1215	unsigned int len = qdisc_pkt_len(skb), gso_segs;
1216	struct qfq_sched *q = qdisc_priv(sch);
1217	struct qfq_class *cl;
1218	struct qfq_aggregate *agg;
1219	int err = `0`;
1220	bool first;
1221
1222	cl = qfq_classify(skb, sch, qerr: &err);
1223	if (cl == NULL) {
1224	if (err & __NET_XMIT_BYPASS)
1225	qdisc_qstats_drop(sch);
1226	__qdisc_drop(skb, to_free);
1227	return err;
1228	}
1229	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
1230
1231	if (unlikely(cl->agg->lmax < len)) {
1232	pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
1233	cl->agg->lmax, len, cl->common.classid);
1234	err = qfq_change_agg(sch, cl, weight: cl->agg->class_weight, lmax: len);
1235	if (err) {
1236	cl->qstats.drops++;
1237	return qdisc_drop(skb, sch, to_free);
1238	}
1239	}
1240
1241	gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : `1`;
1242	first = !cl->qdisc->q.qlen;
1243	err = qdisc_enqueue(skb, sch: cl->qdisc, to_free);
1244	if (unlikely(err != NET_XMIT_SUCCESS)) {
1245	pr_debug("qfq_enqueue: enqueue failed %d\n", err);
1246	if (net_xmit_drop_count(err)) {
1247	cl->qstats.drops++;
1248	qdisc_qstats_drop(sch);
1249	}
1250	return err;
1251	}
1252
1253	_bstats_update(bstats: &cl->bstats, bytes: len, packets: gso_segs);
1254	sch->qstats.backlog += len;
1255	++sch->q.qlen;
1256
1257	agg = cl->agg;
1258	/ if the queue was not empty, then done here /
1259	if (!first) {
1260	if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
1261	list_first_entry(&agg->active, struct qfq_class, alist)
1262	== cl && cl->deficit < len)
1263	list_move_tail(list: &cl->alist, head: &agg->active);
1264
1265	return err;
1266	}
1267
1268	/ schedule class for service within the aggregate /
1269	cl->deficit = agg->lmax;
1270	list_add_tail(new: &cl->alist, head: &agg->active);
1271
1272	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl \|\|
1273	q->in_serv_agg == agg)
1274	return err; / non-empty or in service, nothing else to do /
1275
1276	qfq_activate_agg(q, agg, enqueue);
1277
1278	return err;
1279	}
1280
1281	/*
1282	* Schedule aggregate according to its timestamps.
1283	*/
1284	static void qfq_schedule_agg(struct qfq_sched q, struct* qfq_aggregate *agg)
1285	{
1286	struct qfq_group *grp = agg->grp;
1287	u64 roundedS;
1288	int s;
1289
1290	roundedS = qfq_round_down(ts: agg->S, shift: grp->slot_shift);
1291
1292	/*
1293	* Insert agg in the correct bucket.
1294	* If agg->S >= grp->S we don't need to adjust the
1295	* bucket list and simply go to the insertion phase.
1296	* Otherwise grp->S is decreasing, we must make room
1297	* in the bucket list, and also recompute the group state.
1298	* Finally, if there were no flows in this group and nobody
1299	* was in ER make sure to adjust V.
1300	*/
1301	if (grp->full_slots) {
1302	if (!qfq_gt(a: grp->S, b: agg->S))
1303	goto skip_update;
1304
1305	/ create a slot for this agg->S /
1306	qfq_slot_rotate(grp, roundedS);
1307	/ group was surely ineligible, remove /
1308	__clear_bit(grp->index, &q->bitmaps[IR]);
1309	__clear_bit(grp->index, &q->bitmaps[IB]);
1310	} else if (!q->bitmaps[ER] && qfq_gt(a: roundedS, b: q->V) &&
1311	q->in_serv_agg == NULL)
1312	q->V = roundedS;
1313
1314	grp->S = roundedS;
1315	grp->F = roundedS + (`2ULL` << grp->slot_shift);
1316	s = qfq_calc_state(q, grp);
1317	__set_bit(grp->index, &q->bitmaps[s]);
1318
1319	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
1320	s, q->bitmaps[s],
1321	(unsigned long long) agg->S,
1322	(unsigned long long) agg->F,
1323	(unsigned long long) q->V);
1324
1325	skip_update:
1326	qfq_slot_insert(grp, agg, roundedS);
1327	}
1328
1329
1330	/ Update agg ts and schedule agg for service /
1331	static void qfq_activate_agg(struct qfq_sched q, struct* qfq_aggregate *agg,
1332	enum update_reason reason)
1333	{
1334	agg->initial_budget = agg->budget = agg->budgetmax; / recharge budg. /
1335
1336	qfq_update_agg_ts(q, agg, reason);
1337	if (q->in_serv_agg == NULL) { / no aggr. in service or scheduled /
1338	q->in_serv_agg = agg; / start serving this aggregate /
1339	/ update V: to be in service, agg must be eligible /
1340	q->oldV = q->V = agg->S;
1341	} else if (agg != q->in_serv_agg)
1342	qfq_schedule_agg(q, agg);
1343	}
1344
1345	static void qfq_slot_remove(struct qfq_sched q, struct* qfq_group *grp,
1346	struct qfq_aggregate *agg)
1347	{
1348	unsigned int i, offset;
1349	u64 roundedS;
1350
1351	roundedS = qfq_round_down(ts: agg->S, shift: grp->slot_shift);
1352	offset = (roundedS - grp->S) >> grp->slot_shift;
1353
1354	i = (grp->front + offset) % QFQ_MAX_SLOTS;
1355
1356	hlist_del(n: &agg->next);
1357	if (hlist_empty(h: &grp->slots[i]))
1358	__clear_bit(offset, &grp->full_slots);
1359	}
1360
1361	/*
1362	* Called to forcibly deschedule an aggregate. If the aggregate is
1363	* not in the front bucket, or if the latter has other aggregates in
1364	* the front bucket, we can simply remove the aggregate with no other
1365	* side effects.
1366	* Otherwise we must propagate the event up.
1367	*/
1368	static void qfq_deactivate_agg(struct qfq_sched q, struct* qfq_aggregate *agg)
1369	{
1370	struct qfq_group *grp = agg->grp;
1371	unsigned long mask;
1372	u64 roundedS;
1373	int s;
1374
1375	if (agg == q->in_serv_agg) {
1376	charge_actual_service(agg);
1377	q->in_serv_agg = qfq_choose_next_agg(q);
1378	return;
1379	}
1380
1381	agg->F = agg->S;
1382	qfq_slot_remove(q, grp, agg);
1383
1384	if (!grp->full_slots) {
1385	__clear_bit(grp->index, &q->bitmaps[IR]);
1386	__clear_bit(grp->index, &q->bitmaps[EB]);
1387	__clear_bit(grp->index, &q->bitmaps[IB]);
1388
1389	if (test_bit(grp->index, &q->bitmaps[ER]) &&
1390	!(q->bitmaps[ER] & ~((`1UL` << grp->index) - `1`))) {
1391	mask = q->bitmaps[ER] & ((`1UL` << grp->index) - `1`);
1392	if (mask)
1393	mask = ~((`1UL` << __fls(word: mask)) - `1`);
1394	else
1395	mask = ~`0UL`;
1396	qfq_move_groups(q, mask, src: EB, dst: ER);
1397	qfq_move_groups(q, mask, src: IB, dst: IR);
1398	}
1399	__clear_bit(grp->index, &q->bitmaps[ER]);
1400	} else if (hlist_empty(h: &grp->slots[grp->front])) {
1401	agg = qfq_slot_scan(grp);
1402	roundedS = qfq_round_down(ts: agg->S, shift: grp->slot_shift);
1403	if (grp->S != roundedS) {
1404	__clear_bit(grp->index, &q->bitmaps[ER]);
1405	__clear_bit(grp->index, &q->bitmaps[IR]);
1406	__clear_bit(grp->index, &q->bitmaps[EB]);
1407	__clear_bit(grp->index, &q->bitmaps[IB]);
1408	grp->S = roundedS;
1409	grp->F = roundedS + (`2ULL` << grp->slot_shift);
1410	s = qfq_calc_state(q, grp);
1411	__set_bit(grp->index, &q->bitmaps[s]);
1412	}
1413	}
1414	}
1415
/*
 * qlen_notify class operation: arg carries the class as an opaque
 * unsigned long; the class is deactivated.
 */
static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	qfq_deactivate_class(qdisc_priv(sch), (struct qfq_class *)arg);
}
1423
1424	static int qfq_init_qdisc(struct Qdisc sch, struct* nlattr *opt,
1425	struct netlink_ext_ack *extack)
1426	{
1427	struct qfq_sched *q = qdisc_priv(sch);
1428	struct qfq_group *grp;
1429	int i, j, err;
1430	u32 max_cl_shift, maxbudg_shift, max_classes;
1431
1432	err = tcf_block_get(p_block: &q->block, p_filter_chain: &q->filter_list, q: sch, extack);
1433	if (err)
1434	return err;
1435
1436	err = qdisc_class_hash_init(&q->clhash);
1437	if (err < `0`)
1438	return err;
1439
1440	max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + `1`,
1441	QFQ_MAX_AGG_CLASSES);
1442	/ max_cl_shift = floor(log_2(max_classes)) /
1443	max_cl_shift = __fls(word: max_classes);
1444	q->max_agg_classes = `1`<<max_cl_shift;
1445
1446	/ maxbudg_shift = log2(max_len * max_classes_per_agg) /
1447	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
1448	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
1449
1450	for (i = `0`; i <= QFQ_MAX_INDEX; i++) {
1451	grp = &q->groups[i];
1452	grp->index = i;
1453	grp->slot_shift = q->min_slot_shift + i;
1454	for (j = `0`; j < QFQ_MAX_SLOTS; j++)
1455	INIT_HLIST_HEAD(&grp->slots[j]);
1456	}
1457
1458	INIT_HLIST_HEAD(&q->nonfull_aggs);
1459
1460	return `0`;
1461	}
1462
1463	static void qfq_reset_qdisc(struct Qdisc *sch)
1464	{
1465	struct qfq_sched *q = qdisc_priv(sch);
1466	struct qfq_class *cl;
1467	unsigned int i;
1468
1469	for (i = `0`; i < q->clhash.hashsize; i++) {
1470	hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1471	if (cl->qdisc->q.qlen > `0`)
1472	qfq_deactivate_class(q, cl);
1473
1474	qdisc_reset(qdisc: cl->qdisc);
1475	}
1476	}
1477	}
1478
1479	static void qfq_destroy_qdisc(struct Qdisc *sch)
1480	{
1481	struct qfq_sched *q = qdisc_priv(sch);
1482	struct qfq_class *cl;
1483	struct hlist_node *next;
1484	unsigned int i;
1485
1486	tcf_block_put(block: q->block);
1487
1488	for (i = `0`; i < q->clhash.hashsize; i++) {
1489	hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1490	common.hnode) {
1491	qfq_destroy_class(sch, cl);
1492	}
1493	}
1494	qdisc_class_hash_destroy(&q->clhash);
1495	}
1496
1497	static const struct Qdisc_class_ops qfq_class_ops = {
1498	.change = qfq_change_class,
1499	.delete = qfq_delete_class,
1500	.find = qfq_search_class,
1501	.tcf_block = qfq_tcf_block,
1502	.bind_tcf = qfq_bind_tcf,
1503	.unbind_tcf = qfq_unbind_tcf,
1504	.graft = qfq_graft_class,
1505	.leaf = qfq_class_leaf,
1506	.qlen_notify = qfq_qlen_notify,
1507	.dump = qfq_dump_class,
1508	.dump_stats = qfq_dump_class_stats,
1509	.walk = qfq_walk,
1510	};
1511
1512	static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
1513	.cl_ops = &qfq_class_ops,
1514	.id = "qfq",
1515	.priv_size = sizeof(struct qfq_sched),
1516	.enqueue = qfq_enqueue,
1517	.dequeue = qfq_dequeue,
1518	.peek = qdisc_peek_dequeued,
1519	.init = qfq_init_qdisc,
1520	.reset = qfq_reset_qdisc,
1521	.destroy = qfq_destroy_qdisc,
1522	.owner = THIS_MODULE,
1523	};
1524
1525	static int __init qfq_init(void)
1526	{
1527	return register_qdisc(qops: &qfq_qdisc_ops);
1528	}
1529
1530	static void __exit qfq_exit(void)
1531	{
1532	unregister_qdisc(qops: &qfq_qdisc_ops);
1533	}
1534
1535	module_init(qfq_init);
1536	module_exit(qfq_exit);
1537	MODULE_LICENSE("GPL");
1538

source code of linux/net/sched/sch_qfq.c