1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
2 | |
3 | /* COMMON Applications Kept Enhanced (CAKE) discipline |
4 | * |
5 | * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> |
6 | * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> |
7 | * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> |
8 | * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> |
9 | * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> |
10 | * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> |
11 | * |
12 | * The CAKE Principles: |
13 | * (or, how to have your cake and eat it too) |
14 | * |
15 | * This is a combination of several shaping, AQM and FQ techniques into one |
16 | * easy-to-use package: |
17 | * |
18 | * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE |
19 | * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), |
20 | * eliminating the need for any sort of burst parameter (eg. token bucket |
21 | * depth). Burst support is limited to that necessary to overcome scheduling |
22 | * latency. |
23 | * |
24 | * - A Diffserv-aware priority queue, giving more priority to certain classes, |
25 | * up to a specified fraction of bandwidth. Above that bandwidth threshold, |
26 | * the priority is reduced to avoid starving other tins. |
27 | * |
28 | * - Each priority tin has a separate Flow Queue system, to isolate traffic |
29 | * flows from each other. This prevents a burst on one flow from increasing |
30 | * the delay to another. Flows are distributed to queues using a |
31 | * set-associative hash function. |
32 | * |
33 | * - Each queue is actively managed by Cobalt, which is a combination of the |
34 | * Codel and Blue AQM algorithms. This serves flows fairly, and signals |
35 | * congestion early via ECN (if available) and/or packet drops, to keep |
36 | * latency low. The codel parameters are auto-tuned based on the bandwidth |
37 | * setting, as is necessary at low bandwidths. |
38 | * |
39 | * The configuration parameters are kept deliberately simple for ease of use. |
40 | * Everything has sane defaults. Complete generality of configuration is *not* |
41 | * a goal. |
42 | * |
43 | * The priority queue operates according to a weighted DRR scheme, combined with |
44 | * a bandwidth tracker which reuses the shaper logic to detect which side of the |
45 | * bandwidth sharing threshold the tin is operating. This determines whether a |
46 | * priority-based weight (high) or a bandwidth-based weight (low) is used for |
47 | * that tin in the current pass. |
48 | * |
49 | * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly |
50 | * granted us permission to leverage. |
51 | */ |
52 | |
53 | #include <linux/module.h> |
54 | #include <linux/types.h> |
55 | #include <linux/kernel.h> |
56 | #include <linux/jiffies.h> |
57 | #include <linux/string.h> |
58 | #include <linux/in.h> |
59 | #include <linux/errno.h> |
60 | #include <linux/init.h> |
61 | #include <linux/skbuff.h> |
62 | #include <linux/jhash.h> |
63 | #include <linux/slab.h> |
64 | #include <linux/vmalloc.h> |
65 | #include <linux/reciprocal_div.h> |
66 | #include <net/netlink.h> |
67 | #include <linux/if_vlan.h> |
68 | #include <net/gso.h> |
69 | #include <net/pkt_sched.h> |
70 | #include <net/pkt_cls.h> |
71 | #include <net/tcp.h> |
72 | #include <net/flow_dissector.h> |
73 | |
74 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
75 | #include <net/netfilter/nf_conntrack_core.h> |
76 | #endif |
77 | |
78 | #define CAKE_SET_WAYS (8) |
79 | #define CAKE_MAX_TINS (8) |
80 | #define CAKE_QUEUES (1024) |
81 | #define CAKE_FLOW_MASK 63 |
82 | #define CAKE_FLOW_NAT_FLAG 64 |
83 | |
84 | /* struct cobalt_params - contains codel and blue parameters |
85 | * @interval: codel initial drop rate |
86 | * @target: maximum persistent sojourn time & blue update rate |
87 | * @mtu_time: serialisation delay of maximum-size packet |
88 | * @p_inc: increment of blue drop probability (0.32 fxp) |
89 | * @p_dec: decrement of blue drop probability (0.32 fxp) |
90 | */ |
91 | struct cobalt_params { |
92 | u64 interval; |
93 | u64 target; |
94 | u64 mtu_time; |
95 | u32 p_inc; |
96 | u32 p_dec; |
97 | }; |
98 | |
99 | /* struct cobalt_vars - contains codel and blue variables |
100 | * @count: codel dropping frequency |
101 | * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 |
102 | * @drop_next: time to drop next packet, or when we dropped last |
103 | * @blue_timer: Blue time to next drop |
104 | * @p_drop: BLUE drop probability (0.32 fxp) |
105 | * @dropping: set if in dropping state |
106 | * @ecn_marked: set if marked |
107 | */ |
108 | struct cobalt_vars { |
109 | u32 count; |
110 | u32 rec_inv_sqrt; |
111 | ktime_t drop_next; |
112 | ktime_t blue_timer; |
113 | u32 p_drop; |
114 | bool dropping; |
115 | bool ecn_marked; |
116 | }; |
117 | |
118 | enum { |
119 | CAKE_SET_NONE = 0, |
120 | CAKE_SET_SPARSE, |
121 | CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ |
122 | CAKE_SET_BULK, |
123 | CAKE_SET_DECAYING |
124 | }; |
125 | |
126 | struct cake_flow { |
127 | /* this stuff is all needed per-flow at dequeue time */ |
128 | struct sk_buff *head; |
129 | struct sk_buff *tail; |
130 | struct list_head flowchain; |
131 | s32 deficit; |
132 | u32 dropped; |
133 | struct cobalt_vars cvars; |
134 | u16 srchost; /* index into cake_host table */ |
135 | u16 dsthost; |
136 | u8 set; |
137 | }; /* please try to keep this structure <= 64 bytes */ |
138 | |
139 | struct cake_host { |
140 | u32 srchost_tag; |
141 | u32 dsthost_tag; |
142 | u16 srchost_bulk_flow_count; |
143 | u16 dsthost_bulk_flow_count; |
144 | }; |
145 | |
146 | struct cake_heap_entry { |
147 | u16 t:3, b:10; |
148 | }; |
149 | |
150 | struct cake_tin_data { |
151 | struct cake_flow flows[CAKE_QUEUES]; |
152 | u32 backlogs[CAKE_QUEUES]; |
153 | u32 tags[CAKE_QUEUES]; /* for set association */ |
154 | u16 overflow_idx[CAKE_QUEUES]; |
155 | struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ |
156 | u16 flow_quantum; |
157 | |
158 | struct cobalt_params cparams; |
159 | u32 drop_overlimit; |
160 | u16 bulk_flow_count; |
161 | u16 sparse_flow_count; |
162 | u16 decaying_flow_count; |
163 | u16 unresponsive_flow_count; |
164 | |
165 | u32 max_skblen; |
166 | |
167 | struct list_head new_flows; |
168 | struct list_head old_flows; |
169 | struct list_head decaying_flows; |
170 | |
171 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ |
172 | ktime_t time_next_packet; |
173 | u64 tin_rate_ns; |
174 | u64 tin_rate_bps; |
175 | u16 tin_rate_shft; |
176 | |
177 | u16 tin_quantum; |
178 | s32 tin_deficit; |
179 | u32 tin_backlog; |
180 | u32 tin_dropped; |
181 | u32 tin_ecn_mark; |
182 | |
183 | u32 packets; |
184 | u64 bytes; |
185 | |
186 | u32 ack_drops; |
187 | |
188 | /* moving averages */ |
189 | u64 avge_delay; |
190 | u64 peak_delay; |
191 | u64 base_delay; |
192 | |
193 | /* hash function stats */ |
194 | u32 way_directs; |
195 | u32 way_hits; |
196 | u32 way_misses; |
197 | u32 way_collisions; |
198 | }; /* number of tins is small, so size of this struct doesn't matter much */ |
199 | |
200 | struct cake_sched_data { |
201 | struct tcf_proto __rcu *filter_list; /* optional external classifier */ |
202 | struct tcf_block *block; |
203 | struct cake_tin_data *tins; |
204 | |
205 | struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; |
206 | u16 overflow_timeout; |
207 | |
208 | u16 tin_cnt; |
209 | u8 tin_mode; |
210 | u8 flow_mode; |
211 | u8 ack_filter; |
212 | u8 atm_mode; |
213 | |
214 | u32 fwmark_mask; |
215 | u16 fwmark_shft; |
216 | |
217 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ |
218 | u16 rate_shft; |
219 | ktime_t time_next_packet; |
220 | ktime_t failsafe_next_packet; |
221 | u64 rate_ns; |
222 | u64 rate_bps; |
223 | u16 rate_flags; |
224 | s16 rate_overhead; |
225 | u16 rate_mpu; |
226 | u64 interval; |
227 | u64 target; |
228 | |
229 | /* resource tracking */ |
230 | u32 buffer_used; |
231 | u32 buffer_max_used; |
232 | u32 buffer_limit; |
233 | u32 buffer_config_limit; |
234 | |
235 | /* indices for dequeue */ |
236 | u16 cur_tin; |
237 | u16 cur_flow; |
238 | |
239 | struct qdisc_watchdog watchdog; |
240 | const u8 *tin_index; |
241 | const u8 *tin_order; |
242 | |
243 | /* bandwidth capacity estimate */ |
244 | ktime_t last_packet_time; |
245 | ktime_t avg_window_begin; |
246 | u64 avg_packet_interval; |
247 | u64 avg_window_bytes; |
248 | u64 avg_peak_bandwidth; |
249 | ktime_t last_reconfig_time; |
250 | |
251 | /* packet length stats */ |
252 | u32 avg_netoff; |
253 | u16 max_netlen; |
254 | u16 max_adjlen; |
255 | u16 min_netlen; |
256 | u16 min_adjlen; |
257 | }; |
258 | |
259 | enum { |
260 | CAKE_FLAG_OVERHEAD = BIT(0), |
261 | CAKE_FLAG_AUTORATE_INGRESS = BIT(1), |
262 | CAKE_FLAG_INGRESS = BIT(2), |
263 | CAKE_FLAG_WASH = BIT(3), |
264 | CAKE_FLAG_SPLIT_GSO = BIT(4) |
265 | }; |
266 | |
267 | /* COBALT operates the Codel and BLUE algorithms in parallel, in order to |
268 | * obtain the best features of each. Codel is excellent on flows which |
269 | * respond to congestion signals in a TCP-like way. BLUE is more effective on |
270 | * unresponsive flows. |
271 | */ |
272 | |
273 | struct cobalt_skb_cb { |
274 | ktime_t enqueue_time; |
275 | u32 adjusted_len; |
276 | }; |
277 | |
278 | static u64 us_to_ns(u64 us) |
279 | { |
280 | return us * NSEC_PER_USEC; |
281 | } |
282 | |
283 | static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) |
284 | { |
	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 | return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; |
287 | } |
288 | |
289 | static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) |
290 | { |
291 | return get_cobalt_cb(skb)->enqueue_time; |
292 | } |
293 | |
294 | static void cobalt_set_enqueue_time(struct sk_buff *skb, |
295 | ktime_t now) |
296 | { |
297 | get_cobalt_cb(skb)->enqueue_time = now; |
298 | } |
299 | |
300 | static u16 quantum_div[CAKE_QUEUES + 1] = {0}; |
301 | |
302 | /* Diffserv lookup tables */ |
303 | |
304 | static const u8 precedence[] = { |
305 | 0, 0, 0, 0, 0, 0, 0, 0, |
306 | 1, 1, 1, 1, 1, 1, 1, 1, |
307 | 2, 2, 2, 2, 2, 2, 2, 2, |
308 | 3, 3, 3, 3, 3, 3, 3, 3, |
309 | 4, 4, 4, 4, 4, 4, 4, 4, |
310 | 5, 5, 5, 5, 5, 5, 5, 5, |
311 | 6, 6, 6, 6, 6, 6, 6, 6, |
312 | 7, 7, 7, 7, 7, 7, 7, 7, |
313 | }; |
314 | |
315 | static const u8 diffserv8[] = { |
316 | 2, 0, 1, 2, 4, 2, 2, 2, |
317 | 1, 2, 1, 2, 1, 2, 1, 2, |
318 | 5, 2, 4, 2, 4, 2, 4, 2, |
319 | 3, 2, 3, 2, 3, 2, 3, 2, |
320 | 6, 2, 3, 2, 3, 2, 3, 2, |
321 | 6, 2, 2, 2, 6, 2, 6, 2, |
322 | 7, 2, 2, 2, 2, 2, 2, 2, |
323 | 7, 2, 2, 2, 2, 2, 2, 2, |
324 | }; |
325 | |
326 | static const u8 diffserv4[] = { |
327 | 0, 1, 0, 0, 2, 0, 0, 0, |
328 | 1, 0, 0, 0, 0, 0, 0, 0, |
329 | 2, 0, 2, 0, 2, 0, 2, 0, |
330 | 2, 0, 2, 0, 2, 0, 2, 0, |
331 | 3, 0, 2, 0, 2, 0, 2, 0, |
332 | 3, 0, 0, 0, 3, 0, 3, 0, |
333 | 3, 0, 0, 0, 0, 0, 0, 0, |
334 | 3, 0, 0, 0, 0, 0, 0, 0, |
335 | }; |
336 | |
337 | static const u8 diffserv3[] = { |
338 | 0, 1, 0, 0, 2, 0, 0, 0, |
339 | 1, 0, 0, 0, 0, 0, 0, 0, |
340 | 0, 0, 0, 0, 0, 0, 0, 0, |
341 | 0, 0, 0, 0, 0, 0, 0, 0, |
342 | 0, 0, 0, 0, 0, 0, 0, 0, |
343 | 0, 0, 0, 0, 2, 0, 2, 0, |
344 | 2, 0, 0, 0, 0, 0, 0, 0, |
345 | 2, 0, 0, 0, 0, 0, 0, 0, |
346 | }; |
347 | |
348 | static const u8 besteffort[] = { |
349 | 0, 0, 0, 0, 0, 0, 0, 0, |
350 | 0, 0, 0, 0, 0, 0, 0, 0, |
351 | 0, 0, 0, 0, 0, 0, 0, 0, |
352 | 0, 0, 0, 0, 0, 0, 0, 0, |
353 | 0, 0, 0, 0, 0, 0, 0, 0, |
354 | 0, 0, 0, 0, 0, 0, 0, 0, |
355 | 0, 0, 0, 0, 0, 0, 0, 0, |
356 | 0, 0, 0, 0, 0, 0, 0, 0, |
357 | }; |
358 | |
359 | /* tin priority order for stats dumping */ |
360 | |
361 | static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; |
362 | static const u8 bulk_order[] = {1, 0, 2, 3}; |
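
/* Worked example of the lookups above (illustrative only): an EF-marked
 * packet (DSCP 46) indexes diffserv4[46] == 3, i.e. the Voice tin, while
 * CS1 (DSCP 8) indexes diffserv4[8] == 1, the Bulk tin. The *_order arrays
 * only control the order in which tins are dumped in the stats, so that
 * e.g. Bulk is reported before Best Effort; they do not affect scheduling.
 */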
363 | |
364 | #define REC_INV_SQRT_CACHE (16) |
365 | static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; |
366 | |
367 | /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots |
368 | * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) |
369 | * |
370 | * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 |
371 | */ |
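
/* Illustrative numerics for the Q0.32 format described above: 0xFFFFFFFF
 * is as close to 1.0 as the format can represent and 0x80000000 is 0.5,
 * so entries 0 and 1 of the cache below hold ~1.0 while entry 4 converges
 * to ~0x80000000, i.e. 1/sqrt(4).
 */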
372 | |
373 | static void cobalt_newton_step(struct cobalt_vars *vars) |
374 | { |
375 | u32 invsqrt, invsqrt2; |
376 | u64 val; |
377 | |
378 | invsqrt = vars->rec_inv_sqrt; |
379 | invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; |
380 | val = (3LL << 32) - ((u64)vars->count * invsqrt2); |
381 | |
382 | val >>= 2; /* avoid overflow in following multiply */ |
383 | val = (val * invsqrt) >> (32 - 2 + 1); |
384 | |
385 | vars->rec_inv_sqrt = val; |
386 | } |
387 | |
388 | static void cobalt_invsqrt(struct cobalt_vars *vars) |
389 | { |
390 | if (vars->count < REC_INV_SQRT_CACHE) |
391 | vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; |
392 | else |
393 | cobalt_newton_step(vars); |
394 | } |
395 | |
396 | /* There is a big difference in timing between the accurate values placed in |
397 | * the cache and the approximations given by a single Newton step for small |
398 | * count values, particularly when stepping from count 1 to 2 or vice versa. |
399 | * Above 16, a single Newton step gives sufficient accuracy in either |
400 | * direction, given the precision stored. |
401 | * |
402 | * The magnitude of the error when stepping up to count 2 is such as to give |
403 | * the value that *should* have been produced at count 4. |
404 | */ |
405 | |
406 | static void cobalt_cache_init(void) |
407 | { |
408 | struct cobalt_vars v; |
409 | |
410 | memset(&v, 0, sizeof(v)); |
411 | v.rec_inv_sqrt = ~0U; |
412 | cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; |
413 | |
414 | for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { |
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
419 | |
420 | cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; |
421 | } |
422 | } |
423 | |
424 | static void cobalt_vars_init(struct cobalt_vars *vars) |
425 | { |
426 | memset(vars, 0, sizeof(*vars)); |
427 | |
428 | if (!cobalt_rec_inv_sqrt_cache[0]) { |
429 | cobalt_cache_init(); |
430 | cobalt_rec_inv_sqrt_cache[0] = ~0; |
431 | } |
432 | } |
433 | |
434 | /* CoDel control_law is t + interval/sqrt(count) |
435 | * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid |
436 | * both sqrt() and divide operation. |
437 | */ |
438 | static ktime_t cobalt_control(ktime_t t, |
439 | u64 interval, |
440 | u32 rec_inv_sqrt) |
441 | { |
442 | return ktime_add_ns(t, reciprocal_scale(interval, |
443 | rec_inv_sqrt)); |
444 | } |
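
/* Example (illustrative): with interval = 100 ms and count = 4,
 * rec_inv_sqrt is ~0x80000000 (0.5 in Q0.32), so the next signal is
 * scheduled ~50 ms after 't'; as count grows the spacing shrinks as
 * interval/sqrt(count), exactly as in standard CoDel.
 */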
445 | |
446 | /* Call this when a packet had to be dropped due to queue overflow. Returns |
447 | * true if the BLUE state was quiescent before but active after this call. |
448 | */ |
449 | static bool cobalt_queue_full(struct cobalt_vars *vars, |
450 | struct cobalt_params *p, |
451 | ktime_t now) |
452 | { |
453 | bool up = false; |
454 | |
455 | if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { |
456 | up = !vars->p_drop; |
457 | vars->p_drop += p->p_inc; |
458 | if (vars->p_drop < p->p_inc) |
459 | vars->p_drop = ~0; |
460 | vars->blue_timer = now; |
461 | } |
462 | vars->dropping = true; |
463 | vars->drop_next = now; |
464 | if (!vars->count) |
465 | vars->count = 1; |
466 | |
467 | return up; |
468 | } |
469 | |
470 | /* Call this when the queue was serviced but turned out to be empty. Returns |
471 | * true if the BLUE state was active before but quiescent after this call. |
472 | */ |
473 | static bool cobalt_queue_empty(struct cobalt_vars *vars, |
474 | struct cobalt_params *p, |
475 | ktime_t now) |
476 | { |
477 | bool down = false; |
478 | |
479 | if (vars->p_drop && |
480 | ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { |
481 | if (vars->p_drop < p->p_dec) |
482 | vars->p_drop = 0; |
483 | else |
484 | vars->p_drop -= p->p_dec; |
485 | vars->blue_timer = now; |
486 | down = !vars->p_drop; |
487 | } |
488 | vars->dropping = false; |
489 | |
490 | if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { |
491 | vars->count--; |
492 | cobalt_invsqrt(vars); |
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
496 | } |
497 | |
498 | return down; |
499 | } |
500 | |
501 | /* Call this with a freshly dequeued packet for possible congestion marking. |
502 | * Returns true as an instruction to drop the packet, false for delivery. |
503 | */ |
504 | static bool cobalt_should_drop(struct cobalt_vars *vars, |
505 | struct cobalt_params *p, |
506 | ktime_t now, |
507 | struct sk_buff *skb, |
508 | u32 bulk_flows) |
509 | { |
510 | bool next_due, over_target, drop = false; |
511 | ktime_t schedule; |
512 | u64 sojourn; |
513 | |
514 | /* The 'schedule' variable records, in its sign, whether 'now' is before or |
515 | * after 'drop_next'. This allows 'drop_next' to be updated before the next |
516 | * scheduling decision is actually branched, without destroying that |
517 | * information. Similarly, the first 'schedule' value calculated is preserved |
518 | * in the boolean 'next_due'. |
519 | * |
520 | * As for 'drop_next', we take advantage of the fact that 'interval' is both |
521 | * the delay between first exceeding 'target' and the first signalling event, |
522 | * *and* the scaling factor for the signalling frequency. It's therefore very |
523 | * natural to use a single mechanism for both purposes, and eliminates a |
524 | * significant amount of reference Codel's spaghetti code. To help with this, |
525 | * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close |
526 | * as possible to 1.0 in fixed-point. |
527 | */ |
528 | |
529 | sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); |
530 | schedule = ktime_sub(now, vars->drop_next); |
531 | over_target = sojourn > p->target && |
532 | sojourn > p->mtu_time * bulk_flows * 2 && |
533 | sojourn > p->mtu_time * 4; |
	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535 | |
536 | vars->ecn_marked = false; |
537 | |
538 | if (over_target) { |
539 | if (!vars->dropping) { |
540 | vars->dropping = true; |
			vars->drop_next = cobalt_control(now,
							 p->interval,
							 vars->rec_inv_sqrt);
544 | } |
545 | if (!vars->count) |
546 | vars->count = 1; |
547 | } else if (vars->dropping) { |
548 | vars->dropping = false; |
549 | } |
550 | |
551 | if (next_due && vars->dropping) { |
552 | /* Use ECN mark if possible, otherwise drop */ |
553 | drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); |
554 | |
555 | vars->count++; |
556 | if (!vars->count) |
557 | vars->count--; |
558 | cobalt_invsqrt(vars); |
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
		schedule = ktime_sub(now, vars->drop_next);
	} else {
		while (next_due) {
			vars->count--;
			cobalt_invsqrt(vars);
			vars->drop_next = cobalt_control(vars->drop_next,
							 p->interval,
							 vars->rec_inv_sqrt);
			schedule = ktime_sub(now, vars->drop_next);
			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572 | } |
573 | } |
574 | |
575 | /* Simple BLUE implementation. Lack of ECN is deliberate. */ |
576 | if (vars->p_drop) |
577 | drop |= (get_random_u32() < vars->p_drop); |
578 | |
579 | /* Overload the drop_next field as an activity timeout */ |
580 | if (!vars->count) |
581 | vars->drop_next = ktime_add_ns(now, p->interval); |
	else if (ktime_to_ns(schedule) > 0 && !drop)
583 | vars->drop_next = now; |
584 | |
585 | return drop; |
586 | } |
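
/* Rough numbers for the over_target test above (illustrative only): with
 * a 5 ms target and a 1500-byte MTU, mtu_time is ~12 us at 1 Gbit/s, so
 * the target term dominates; at 1 Mbit/s mtu_time is ~12 ms, so the
 * 4 * mtu_time floor (~48 ms) takes over and avoids signalling on queues
 * that simply cannot drain any faster.
 */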
587 | |
588 | static bool cake_update_flowkeys(struct flow_keys *keys, |
589 | const struct sk_buff *skb) |
590 | { |
591 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
592 | struct nf_conntrack_tuple tuple = {}; |
593 | bool rev = !skb->_nfct, upd = false; |
594 | __be32 ip; |
595 | |
	if (skb_protocol(skb, true) != htons(ETH_P_IP))
		return false;

	if (!nf_ct_get_tuple_skb(&tuple, skb))
600 | return false; |
601 | |
602 | ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; |
603 | if (ip != keys->addrs.v4addrs.src) { |
604 | keys->addrs.v4addrs.src = ip; |
605 | upd = true; |
606 | } |
607 | ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; |
608 | if (ip != keys->addrs.v4addrs.dst) { |
609 | keys->addrs.v4addrs.dst = ip; |
610 | upd = true; |
611 | } |
612 | |
613 | if (keys->ports.ports) { |
614 | __be16 port; |
615 | |
616 | port = rev ? tuple.dst.u.all : tuple.src.u.all; |
617 | if (port != keys->ports.src) { |
618 | keys->ports.src = port; |
619 | upd = true; |
620 | } |
621 | port = rev ? tuple.src.u.all : tuple.dst.u.all; |
622 | if (port != keys->ports.dst) { |
			keys->ports.dst = port;
624 | upd = true; |
625 | } |
626 | } |
627 | return upd; |
628 | #else |
629 | return false; |
630 | #endif |
631 | } |
632 | |
633 | /* Cake has several subtle multiple bit settings. In these cases you |
634 | * would be matching triple isolate mode as well. |
635 | */ |
636 | |
637 | static bool cake_dsrc(int flow_mode) |
638 | { |
639 | return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; |
640 | } |
641 | |
642 | static bool cake_ddst(int flow_mode) |
643 | { |
644 | return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; |
645 | } |
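
/* Example of the "multiple bit settings" mentioned above (flag values
 * taken from the CAKE_FLOW_* enum in the uapi pkt_sched.h): DUAL_SRC and
 * DUAL_DST are each a strict subset of the bits in TRIPLE, so a
 * triple-isolate configuration satisfies both cake_dsrc() and
 * cake_ddst(), while plain dual-srchost mode satisfies only cake_dsrc().
 */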
646 | |
647 | static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, |
648 | int flow_mode, u16 flow_override, u16 host_override) |
649 | { |
650 | bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); |
651 | bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); |
652 | bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); |
653 | u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; |
654 | u16 reduced_hash, srchost_idx, dsthost_idx; |
655 | struct flow_keys keys, host_keys; |
656 | bool use_skbhash = skb->l4_hash; |
657 | |
658 | if (unlikely(flow_mode == CAKE_FLOW_NONE)) |
659 | return 0; |
660 | |
661 | /* If both overrides are set, or we can use the SKB hash and nat mode is |
662 | * disabled, we can skip packet dissection entirely. If nat mode is |
663 | * enabled there's another check below after doing the conntrack lookup. |
664 | */ |
665 | if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) |
666 | goto skip_hash; |
667 | |
	skb_flow_dissect_flow_keys(skb, &keys,
				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	/* Don't use the SKB hash if we change the lookup keys from conntrack */
	if (nat_enabled && cake_update_flowkeys(&keys, skb))
673 | use_skbhash = false; |
674 | |
675 | /* If we can still use the SKB hash and don't need the host hash, we can |
676 | * skip the rest of the hashing procedure |
677 | */ |
678 | if (use_skbhash && !hash_hosts) |
679 | goto skip_hash; |
680 | |
681 | /* flow_hash_from_keys() sorts the addresses by value, so we have |
682 | * to preserve their order in a separate data structure to treat |
683 | * src and dst host addresses as independently selectable. |
684 | */ |
685 | host_keys = keys; |
686 | host_keys.ports.ports = 0; |
687 | host_keys.basic.ip_proto = 0; |
688 | host_keys.keyid.keyid = 0; |
689 | host_keys.tags.flow_label = 0; |
690 | |
691 | switch (host_keys.control.addr_type) { |
692 | case FLOW_DISSECTOR_KEY_IPV4_ADDRS: |
693 | host_keys.addrs.v4addrs.src = 0; |
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		host_keys.addrs.v4addrs.dst = 0;
		srchost_hash = flow_hash_from_keys(&host_keys);
698 | break; |
699 | |
700 | case FLOW_DISSECTOR_KEY_IPV6_ADDRS: |
701 | memset(&host_keys.addrs.v6addrs.src, 0, |
702 | sizeof(host_keys.addrs.v6addrs.src)); |
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		memset(&host_keys.addrs.v6addrs.dst, 0,
		       sizeof(host_keys.addrs.v6addrs.dst));
		srchost_hash = flow_hash_from_keys(&host_keys);
708 | break; |
709 | |
710 | default: |
711 | dsthost_hash = 0; |
712 | srchost_hash = 0; |
713 | } |
714 | |
715 | /* This *must* be after the above switch, since as a |
716 | * side-effect it sorts the src and dst addresses. |
717 | */ |
718 | if (hash_flows && !use_skbhash) |
		flow_hash = flow_hash_from_keys(&keys);
720 | |
721 | skip_hash: |
722 | if (flow_override) |
723 | flow_hash = flow_override - 1; |
724 | else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) |
725 | flow_hash = skb->hash; |
726 | if (host_override) { |
727 | dsthost_hash = host_override - 1; |
728 | srchost_hash = host_override - 1; |
729 | } |
730 | |
731 | if (!(flow_mode & CAKE_FLOW_FLOWS)) { |
732 | if (flow_mode & CAKE_FLOW_SRC_IP) |
733 | flow_hash ^= srchost_hash; |
734 | |
735 | if (flow_mode & CAKE_FLOW_DST_IP) |
736 | flow_hash ^= dsthost_hash; |
737 | } |
738 | |
739 | reduced_hash = flow_hash % CAKE_QUEUES; |
740 | |
741 | /* set-associative hashing */ |
742 | /* fast path if no hash collision (direct lookup succeeds) */ |
743 | if (likely(q->tags[reduced_hash] == flow_hash && |
744 | q->flows[reduced_hash].set)) { |
745 | q->way_directs++; |
746 | } else { |
747 | u32 inner_hash = reduced_hash % CAKE_SET_WAYS; |
748 | u32 outer_hash = reduced_hash - inner_hash; |
749 | bool allocate_src = false; |
750 | bool allocate_dst = false; |
751 | u32 i, k; |
752 | |
753 | /* check if any active queue in the set is reserved for |
754 | * this flow. |
755 | */ |
756 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; |
757 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
758 | if (q->tags[outer_hash + k] == flow_hash) { |
759 | if (i) |
760 | q->way_hits++; |
761 | |
762 | if (!q->flows[outer_hash + k].set) { |
763 | /* need to increment host refcnts */ |
764 | allocate_src = cake_dsrc(flow_mode); |
765 | allocate_dst = cake_ddst(flow_mode); |
766 | } |
767 | |
768 | goto found; |
769 | } |
770 | } |
771 | |
772 | /* no queue is reserved for this flow, look for an |
773 | * empty one. |
774 | */ |
775 | for (i = 0; i < CAKE_SET_WAYS; |
776 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
777 | if (!q->flows[outer_hash + k].set) { |
778 | q->way_misses++; |
779 | allocate_src = cake_dsrc(flow_mode); |
780 | allocate_dst = cake_ddst(flow_mode); |
781 | goto found; |
782 | } |
783 | } |
784 | |
785 | /* With no empty queues, default to the original |
786 | * queue, accept the collision, update the host tags. |
787 | */ |
788 | q->way_collisions++; |
789 | if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { |
790 | q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; |
791 | q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; |
792 | } |
793 | allocate_src = cake_dsrc(flow_mode); |
794 | allocate_dst = cake_ddst(flow_mode); |
795 | found: |
796 | /* reserve queue for future packets in same flow */ |
797 | reduced_hash = outer_hash + k; |
798 | q->tags[reduced_hash] = flow_hash; |
799 | |
800 | if (allocate_src) { |
801 | srchost_idx = srchost_hash % CAKE_QUEUES; |
802 | inner_hash = srchost_idx % CAKE_SET_WAYS; |
803 | outer_hash = srchost_idx - inner_hash; |
804 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; |
805 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
806 | if (q->hosts[outer_hash + k].srchost_tag == |
807 | srchost_hash) |
808 | goto found_src; |
809 | } |
810 | for (i = 0; i < CAKE_SET_WAYS; |
811 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
812 | if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) |
813 | break; |
814 | } |
815 | q->hosts[outer_hash + k].srchost_tag = srchost_hash; |
816 | found_src: |
817 | srchost_idx = outer_hash + k; |
818 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) |
819 | q->hosts[srchost_idx].srchost_bulk_flow_count++; |
820 | q->flows[reduced_hash].srchost = srchost_idx; |
821 | } |
822 | |
823 | if (allocate_dst) { |
824 | dsthost_idx = dsthost_hash % CAKE_QUEUES; |
825 | inner_hash = dsthost_idx % CAKE_SET_WAYS; |
826 | outer_hash = dsthost_idx - inner_hash; |
827 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; |
828 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
829 | if (q->hosts[outer_hash + k].dsthost_tag == |
830 | dsthost_hash) |
831 | goto found_dst; |
832 | } |
833 | for (i = 0; i < CAKE_SET_WAYS; |
834 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
835 | if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) |
836 | break; |
837 | } |
838 | q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; |
839 | found_dst: |
840 | dsthost_idx = outer_hash + k; |
841 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) |
842 | q->hosts[dsthost_idx].dsthost_bulk_flow_count++; |
843 | q->flows[reduced_hash].dsthost = dsthost_idx; |
844 | } |
845 | } |
846 | |
847 | return reduced_hash; |
848 | } |
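
/* Illustrative walk-through of the set-associative lookup above: with
 * CAKE_QUEUES == 1024 and CAKE_SET_WAYS == 8 there are 128 sets of eight
 * queues. A flow with reduced_hash == 300 starts at inner_hash = 300 % 8
 * = 4 inside the set covering queues 296..303 (outer_hash == 296), and
 * wraps around within those eight entries before accepting a collision.
 */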
849 | |
850 | /* helper functions : might be changed when/if skb use a standard list_head */ |
851 | /* remove one skb from head of slot queue */ |
852 | |
853 | static struct sk_buff *dequeue_head(struct cake_flow *flow) |
854 | { |
855 | struct sk_buff *skb = flow->head; |
856 | |
857 | if (skb) { |
858 | flow->head = skb->next; |
859 | skb_mark_not_on_list(skb); |
860 | } |
861 | |
862 | return skb; |
863 | } |
864 | |
865 | /* add skb to flow queue (tail add) */ |
866 | |
867 | static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) |
868 | { |
869 | if (!flow->head) |
870 | flow->head = skb; |
871 | else |
872 | flow->tail->next = skb; |
873 | flow->tail = skb; |
874 | skb->next = NULL; |
875 | } |
876 | |
877 | static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, |
878 | struct ipv6hdr *buf) |
879 | { |
880 | unsigned int offset = skb_network_offset(skb); |
881 | struct iphdr *iph; |
882 | |
	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
884 | |
885 | if (!iph) |
886 | return NULL; |
887 | |
888 | if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) |
		return skb_header_pointer(skb, offset + iph->ihl * 4,
					  sizeof(struct ipv6hdr), buf);

	else if (iph->version == 4)
		return iph;

	else if (iph->version == 6)
		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
					  buf);
898 | |
899 | return NULL; |
900 | } |
901 | |
902 | static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, |
903 | void *buf, unsigned int bufsize) |
904 | { |
905 | unsigned int offset = skb_network_offset(skb); |
906 | const struct ipv6hdr *ipv6h; |
907 | const struct tcphdr *tcph; |
908 | const struct iphdr *iph; |
909 | struct ipv6hdr _ipv6h; |
910 | struct tcphdr _tcph; |
911 | |
	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
913 | |
914 | if (!ipv6h) |
915 | return NULL; |
916 | |
917 | if (ipv6h->version == 4) { |
918 | iph = (struct iphdr *)ipv6h; |
919 | offset += iph->ihl * 4; |
920 | |
921 | /* special-case 6in4 tunnelling, as that is a common way to get |
922 | * v6 connectivity in the home |
923 | */ |
924 | if (iph->protocol == IPPROTO_IPV6) { |
			ipv6h = skb_header_pointer(skb, offset,
						   sizeof(_ipv6h), &_ipv6h);
927 | |
928 | if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) |
929 | return NULL; |
930 | |
931 | offset += sizeof(struct ipv6hdr); |
932 | |
933 | } else if (iph->protocol != IPPROTO_TCP) { |
934 | return NULL; |
935 | } |
936 | |
937 | } else if (ipv6h->version == 6) { |
938 | if (ipv6h->nexthdr != IPPROTO_TCP) |
939 | return NULL; |
940 | |
941 | offset += sizeof(struct ipv6hdr); |
942 | } else { |
943 | return NULL; |
944 | } |
945 | |
	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
947 | if (!tcph || tcph->doff < 5) |
948 | return NULL; |
949 | |
	return skb_header_pointer(skb, offset,
				  min(__tcp_hdrlen(tcph), bufsize), buf);
952 | } |
953 | |
954 | static const void *cake_get_tcpopt(const struct tcphdr *tcph, |
955 | int code, int *oplen) |
956 | { |
957 | /* inspired by tcp_parse_options in tcp_input.c */ |
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
959 | const u8 *ptr = (const u8 *)(tcph + 1); |
960 | |
961 | while (length > 0) { |
962 | int opcode = *ptr++; |
963 | int opsize; |
964 | |
965 | if (opcode == TCPOPT_EOL) |
966 | break; |
967 | if (opcode == TCPOPT_NOP) { |
968 | length--; |
969 | continue; |
970 | } |
971 | if (length < 2) |
972 | break; |
973 | opsize = *ptr++; |
974 | if (opsize < 2 || opsize > length) |
975 | break; |
976 | |
977 | if (opcode == code) { |
978 | *oplen = opsize; |
979 | return ptr; |
980 | } |
981 | |
982 | ptr += opsize - 2; |
983 | length -= opsize; |
984 | } |
985 | |
986 | return NULL; |
987 | } |
988 | |
989 | /* Compare two SACK sequences. A sequence is considered greater if it SACKs more |
990 | * bytes than the other. In the case where both sequences ACKs bytes that the |
991 | * other doesn't, A is considered greater. DSACKs in A also makes A be |
992 | * considered greater. |
993 | * |
994 | * @return -1, 0 or 1 as normal compare functions |
995 | */ |
996 | static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, |
997 | const struct tcphdr *tcph_b) |
998 | { |
999 | const struct tcp_sack_block_wire *sack_a, *sack_b; |
1000 | u32 ack_seq_a = ntohl(tcph_a->ack_seq); |
1001 | u32 bytes_a = 0, bytes_b = 0; |
1002 | int oplen_a, oplen_b; |
1003 | bool first = true; |
1004 | |
	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1007 | |
1008 | /* pointers point to option contents */ |
1009 | oplen_a -= TCPOLEN_SACK_BASE; |
1010 | oplen_b -= TCPOLEN_SACK_BASE; |
1011 | |
1012 | if (sack_a && oplen_a >= sizeof(*sack_a) && |
1013 | (!sack_b || oplen_b < sizeof(*sack_b))) |
1014 | return -1; |
1015 | else if (sack_b && oplen_b >= sizeof(*sack_b) && |
1016 | (!sack_a || oplen_a < sizeof(*sack_a))) |
1017 | return 1; |
1018 | else if ((!sack_a || oplen_a < sizeof(*sack_a)) && |
1019 | (!sack_b || oplen_b < sizeof(*sack_b))) |
1020 | return 0; |
1021 | |
1022 | while (oplen_a >= sizeof(*sack_a)) { |
1023 | const struct tcp_sack_block_wire *sack_tmp = sack_b; |
		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1026 | int oplen_tmp = oplen_b; |
1027 | bool found = false; |
1028 | |
1029 | /* DSACK; always considered greater to prevent dropping */ |
		if (before(start_a, ack_seq_a))
1031 | return -1; |
1032 | |
1033 | bytes_a += end_a - start_a; |
1034 | |
1035 | while (oplen_tmp >= sizeof(*sack_tmp)) { |
			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1038 | |
1039 | /* first time through we count the total size */ |
1040 | if (first) |
1041 | bytes_b += end_b - start_b; |
1042 | |
			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1044 | found = true; |
1045 | if (!first) |
1046 | break; |
1047 | } |
1048 | oplen_tmp -= sizeof(*sack_tmp); |
1049 | sack_tmp++; |
1050 | } |
1051 | |
1052 | if (!found) |
1053 | return -1; |
1054 | |
1055 | oplen_a -= sizeof(*sack_a); |
1056 | sack_a++; |
1057 | first = false; |
1058 | } |
1059 | |
1060 | /* If we made it this far, all ranges SACKed by A are covered by B, so |
1061 | * either the SACKs are equal, or B SACKs more bytes. |
1062 | */ |
1063 | return bytes_b > bytes_a ? 1 : 0; |
1064 | } |
1065 | |
1066 | static void cake_tcph_get_tstamp(const struct tcphdr *tcph, |
1067 | u32 *tsval, u32 *tsecr) |
1068 | { |
1069 | const u8 *ptr; |
1070 | int opsize; |
1071 | |
	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1073 | |
1074 | if (ptr && opsize == TCPOLEN_TIMESTAMP) { |
		*tsval = get_unaligned_be32(ptr);
		*tsecr = get_unaligned_be32(ptr + 4);
1077 | } |
1078 | } |
1079 | |
1080 | static bool cake_tcph_may_drop(const struct tcphdr *tcph, |
1081 | u32 tstamp_new, u32 tsecr_new) |
1082 | { |
1083 | /* inspired by tcp_parse_options in tcp_input.c */ |
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1085 | const u8 *ptr = (const u8 *)(tcph + 1); |
1086 | u32 tstamp, tsecr; |
1087 | |
1088 | /* 3 reserved flags must be unset to avoid future breakage |
1089 | * ACK must be set |
1090 | * ECE/CWR are handled separately |
1091 | * All other flags URG/PSH/RST/SYN/FIN must be unset |
1092 | * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) |
1093 | * 0x00C00000 = CWR/ECE (handled separately) |
1094 | * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 |
1095 | */ |
1096 | if (((tcp_flag_word(tcph) & |
1097 | cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) |
1098 | return false; |
1099 | |
1100 | while (length > 0) { |
1101 | int opcode = *ptr++; |
1102 | int opsize; |
1103 | |
1104 | if (opcode == TCPOPT_EOL) |
1105 | break; |
1106 | if (opcode == TCPOPT_NOP) { |
1107 | length--; |
1108 | continue; |
1109 | } |
1110 | if (length < 2) |
1111 | break; |
1112 | opsize = *ptr++; |
1113 | if (opsize < 2 || opsize > length) |
1114 | break; |
1115 | |
1116 | switch (opcode) { |
1117 | case TCPOPT_MD5SIG: /* doesn't influence state */ |
1118 | break; |
1119 | |
1120 | case TCPOPT_SACK: /* stricter checking performed later */ |
1121 | if (opsize % 8 != 2) |
1122 | return false; |
1123 | break; |
1124 | |
1125 | case TCPOPT_TIMESTAMP: |
1126 | /* only drop timestamps lower than new */ |
1127 | if (opsize != TCPOLEN_TIMESTAMP) |
1128 | return false; |
			tstamp = get_unaligned_be32(ptr);
			tsecr = get_unaligned_be32(ptr + 4);
1131 | if (after(tstamp, tstamp_new) || |
1132 | after(tsecr, tsecr_new)) |
1133 | return false; |
1134 | break; |
1135 | |
1136 | case TCPOPT_MSS: /* these should only be set on SYN */ |
1137 | case TCPOPT_WINDOW: |
1138 | case TCPOPT_SACK_PERM: |
1139 | case TCPOPT_FASTOPEN: |
1140 | case TCPOPT_EXP: |
1141 | default: /* don't drop if any unknown options are present */ |
1142 | return false; |
1143 | } |
1144 | |
1145 | ptr += opsize - 2; |
1146 | length -= opsize; |
1147 | } |
1148 | |
1149 | return true; |
1150 | } |
1151 | |
1152 | static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, |
1153 | struct cake_flow *flow) |
1154 | { |
1155 | bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; |
1156 | struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; |
1157 | struct sk_buff *skb_check, *skb_prev = NULL; |
1158 | const struct ipv6hdr *ipv6h, *ipv6h_check; |
1159 | unsigned char _tcph[64], _tcph_check[64]; |
1160 | const struct tcphdr *tcph, *tcph_check; |
1161 | const struct iphdr *iph, *iph_check; |
1162 | struct ipv6hdr _iph, _iph_check; |
1163 | const struct sk_buff *skb; |
1164 | int seglen, num_found = 0; |
1165 | u32 tstamp = 0, tsecr = 0; |
1166 | __be32 elig_flags = 0; |
1167 | int sack_comp; |
1168 | |
1169 | /* no other possible ACKs to filter */ |
1170 | if (flow->head == flow->tail) |
1171 | return NULL; |
1172 | |
1173 | skb = flow->tail; |
	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
	iph = cake_get_iphdr(skb, &_iph);
	if (!tcph)
		return NULL;

	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1180 | |
1181 | /* the 'triggering' packet need only have the ACK flag set. |
1182 | * also check that SYN is not set, as there won't be any previous ACKs. |
1183 | */ |
1184 | if ((tcp_flag_word(tcph) & |
1185 | (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) |
1186 | return NULL; |
1187 | |
1188 | /* the 'triggering' ACK is at the tail of the queue, we have already |
1189 | * returned if it is the only packet in the flow. loop through the rest |
1190 | * of the queue looking for pure ACKs with the same 5-tuple as the |
1191 | * triggering one. |
1192 | */ |
1193 | for (skb_check = flow->head; |
1194 | skb_check && skb_check != skb; |
1195 | skb_prev = skb_check, skb_check = skb_check->next) { |
		iph_check = cake_get_iphdr(skb_check, &_iph_check);
		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
					     sizeof(_tcph_check));
1199 | |
1200 | /* only TCP packets with matching 5-tuple are eligible, and only |
1201 | * drop safe headers |
1202 | */ |
1203 | if (!tcph_check || iph->version != iph_check->version || |
1204 | tcph_check->source != tcph->source || |
1205 | tcph_check->dest != tcph->dest) |
1206 | continue; |
1207 | |
1208 | if (iph_check->version == 4) { |
1209 | if (iph_check->saddr != iph->saddr || |
1210 | iph_check->daddr != iph->daddr) |
1211 | continue; |
1212 | |
			seglen = iph_totlen(skb, iph_check) -
1214 | (4 * iph_check->ihl); |
1215 | } else if (iph_check->version == 6) { |
1216 | ipv6h = (struct ipv6hdr *)iph; |
1217 | ipv6h_check = (struct ipv6hdr *)iph_check; |
1218 | |
			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1221 | continue; |
1222 | |
1223 | seglen = ntohs(ipv6h_check->payload_len); |
1224 | } else { |
1225 | WARN_ON(1); /* shouldn't happen */ |
1226 | continue; |
1227 | } |
1228 | |
1229 | /* If the ECE/CWR flags changed from the previous eligible |
1230 | * packet in the same flow, we should no longer be dropping that |
1231 | * previous packet as this would lose information. |
1232 | */ |
1233 | if (elig_ack && (tcp_flag_word(tcph_check) & |
1234 | (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { |
1235 | elig_ack = NULL; |
1236 | elig_ack_prev = NULL; |
1237 | num_found--; |
1238 | } |
1239 | |
1240 | /* Check TCP options and flags, don't drop ACKs with segment |
1241 | * data, and don't drop ACKs with a higher cumulative ACK |
1242 | * counter than the triggering packet. Check ACK seqno here to |
1243 | * avoid parsing SACK options of packets we are going to exclude |
1244 | * anyway. |
1245 | */ |
		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1248 | after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) |
1249 | continue; |
1250 | |
1251 | /* Check SACK options. The triggering packet must SACK more data |
1252 | * than the ACK under consideration, or SACK the same range but |
1253 | * have a larger cumulative ACK counter. The latter is a |
1254 | * pathological case, but is contained in the following check |
1255 | * anyway, just to be safe. |
1256 | */ |
		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1258 | |
1259 | if (sack_comp < 0 || |
1260 | (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && |
1261 | sack_comp == 0)) |
1262 | continue; |
1263 | |
1264 | /* At this point we have found an eligible pure ACK to drop; if |
1265 | * we are in aggressive mode, we are done. Otherwise, keep |
1266 | * searching unless this is the second eligible ACK we |
1267 | * found. |
1268 | * |
1269 | * Since we want to drop ACK closest to the head of the queue, |
1270 | * save the first eligible ACK we find, even if we need to loop |
1271 | * again. |
1272 | */ |
1273 | if (!elig_ack) { |
1274 | elig_ack = skb_check; |
1275 | elig_ack_prev = skb_prev; |
1276 | elig_flags = (tcp_flag_word(tcph_check) |
1277 | & (TCP_FLAG_ECE | TCP_FLAG_CWR)); |
1278 | } |
1279 | |
1280 | if (num_found++ > 0) |
1281 | goto found; |
1282 | } |
1283 | |
	/* We made it through the queue without finding two eligible ACKs. If
1285 | * we found a single eligible ACK we can drop it in aggressive mode if |
1286 | * we can guarantee that this does not interfere with ECN flag |
1287 | * information. We ensure this by dropping it only if the enqueued |
1288 | * packet is consecutive with the eligible ACK, and their flags match. |
1289 | */ |
1290 | if (elig_ack && aggressive && elig_ack->next == skb && |
1291 | (elig_flags == (tcp_flag_word(tcph) & |
1292 | (TCP_FLAG_ECE | TCP_FLAG_CWR)))) |
1293 | goto found; |
1294 | |
1295 | return NULL; |
1296 | |
1297 | found: |
1298 | if (elig_ack_prev) |
1299 | elig_ack_prev->next = elig_ack->next; |
1300 | else |
1301 | flow->head = elig_ack->next; |
1302 | |
	skb_mark_not_on_list(elig_ack);
1304 | |
1305 | return elig_ack; |
1306 | } |
1307 | |
1308 | static u64 cake_ewma(u64 avg, u64 sample, u32 shift) |
1309 | { |
1310 | avg -= avg >> shift; |
1311 | avg += sample >> shift; |
1312 | return avg; |
1313 | } |
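
/* cake_ewma() is a plain shift-based EWMA: with shift == 8 a new sample
 * contributes 1/256th of its value. E.g. (integer arithmetic, purely
 * illustrative) avg = 1000, sample = 2280 gives 1000 - 3 + 8 = 1005.
 */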
1314 | |
1315 | static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) |
1316 | { |
1317 | if (q->rate_flags & CAKE_FLAG_OVERHEAD) |
1318 | len -= off; |
1319 | |
1320 | if (q->max_netlen < len) |
1321 | q->max_netlen = len; |
1322 | if (q->min_netlen > len) |
1323 | q->min_netlen = len; |
1324 | |
1325 | len += q->rate_overhead; |
1326 | |
1327 | if (len < q->rate_mpu) |
1328 | len = q->rate_mpu; |
1329 | |
1330 | if (q->atm_mode == CAKE_ATM_ATM) { |
1331 | len += 47; |
1332 | len /= 48; |
1333 | len *= 53; |
1334 | } else if (q->atm_mode == CAKE_ATM_PTM) { |
1335 | /* Add one byte per 64 bytes or part thereof. |
1336 | * This is conservative and easier to calculate than the |
1337 | * precise value. |
1338 | */ |
1339 | len += (len + 63) / 64; |
1340 | } |
1341 | |
1342 | if (q->max_adjlen < len) |
1343 | q->max_adjlen = len; |
1344 | if (q->min_adjlen > len) |
1345 | q->min_adjlen = len; |
1346 | |
1347 | return len; |
1348 | } |
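
/* Worked example of the framing compensation above (illustrative): a
 * 100-byte frame with no configured overhead fills ceil(100 / 48) = 3 ATM
 * cells, so ATM mode bills it as 3 * 53 = 159 bytes on the wire; PTM mode
 * bills the same frame as 100 + (100 + 63) / 64 = 102 bytes.
 */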
1349 | |
1350 | static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) |
1351 | { |
1352 | const struct skb_shared_info *shinfo = skb_shinfo(skb); |
1353 | unsigned int hdr_len, last_len = 0; |
1354 | u32 off = skb_network_offset(skb); |
1355 | u32 len = qdisc_pkt_len(skb); |
1356 | u16 segs = 1; |
1357 | |
	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1359 | |
1360 | if (!shinfo->gso_size) |
1361 | return cake_calc_overhead(q, len, off); |
1362 | |
1363 | /* borrowed from qdisc_pkt_len_init() */ |
1364 | hdr_len = skb_transport_offset(skb); |
1365 | |
1366 | /* + transport layer */ |
1367 | if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | |
1368 | SKB_GSO_TCPV6))) { |
1369 | const struct tcphdr *th; |
1370 | struct tcphdr _tcphdr; |
1371 | |
		th = skb_header_pointer(skb, hdr_len,
					sizeof(_tcphdr), &_tcphdr);
1374 | if (likely(th)) |
1375 | hdr_len += __tcp_hdrlen(th); |
1376 | } else { |
1377 | struct udphdr _udphdr; |
1378 | |
		if (skb_header_pointer(skb, hdr_len,
				       sizeof(_udphdr), &_udphdr))
1381 | hdr_len += sizeof(struct udphdr); |
1382 | } |
1383 | |
1384 | if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) |
1385 | segs = DIV_ROUND_UP(skb->len - hdr_len, |
1386 | shinfo->gso_size); |
1387 | else |
1388 | segs = shinfo->gso_segs; |
1389 | |
1390 | len = shinfo->gso_size + hdr_len; |
1391 | last_len = skb->len - shinfo->gso_size * (segs - 1); |
1392 | |
	return (cake_calc_overhead(q, len, off) * (segs - 1) +
		cake_calc_overhead(q, last_len, off));
1395 | } |
1396 | |
1397 | static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) |
1398 | { |
1399 | struct cake_heap_entry ii = q->overflow_heap[i]; |
1400 | struct cake_heap_entry jj = q->overflow_heap[j]; |
1401 | |
1402 | q->overflow_heap[i] = jj; |
1403 | q->overflow_heap[j] = ii; |
1404 | |
1405 | q->tins[ii.t].overflow_idx[ii.b] = j; |
1406 | q->tins[jj.t].overflow_idx[jj.b] = i; |
1407 | } |
1408 | |
1409 | static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) |
1410 | { |
1411 | struct cake_heap_entry ii = q->overflow_heap[i]; |
1412 | |
1413 | return q->tins[ii.t].backlogs[ii.b]; |
1414 | } |
1415 | |
1416 | static void cake_heapify(struct cake_sched_data *q, u16 i) |
1417 | { |
1418 | static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; |
1419 | u32 mb = cake_heap_get_backlog(q, i); |
1420 | u32 m = i; |
1421 | |
1422 | while (m < a) { |
1423 | u32 l = m + m + 1; |
1424 | u32 r = l + 1; |
1425 | |
1426 | if (l < a) { |
			u32 lb = cake_heap_get_backlog(q, l);
1428 | |
1429 | if (lb > mb) { |
1430 | m = l; |
1431 | mb = lb; |
1432 | } |
1433 | } |
1434 | |
1435 | if (r < a) { |
			u32 rb = cake_heap_get_backlog(q, r);
1437 | |
1438 | if (rb > mb) { |
1439 | m = r; |
1440 | mb = rb; |
1441 | } |
1442 | } |
1443 | |
1444 | if (m != i) { |
			cake_heap_swap(q, i, m);
1446 | i = m; |
1447 | } else { |
1448 | break; |
1449 | } |
1450 | } |
1451 | } |
1452 | |
1453 | static void cake_heapify_up(struct cake_sched_data *q, u16 i) |
1454 | { |
1455 | while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { |
1456 | u16 p = (i - 1) >> 1; |
1457 | u32 ib = cake_heap_get_backlog(q, i); |
		u32 pb = cake_heap_get_backlog(q, p);

		if (ib > pb) {
			cake_heap_swap(q, i, p);
1462 | i = p; |
1463 | } else { |
1464 | break; |
1465 | } |
1466 | } |
1467 | } |
1468 | |
1469 | static int cake_advance_shaper(struct cake_sched_data *q, |
1470 | struct cake_tin_data *b, |
1471 | struct sk_buff *skb, |
1472 | ktime_t now, bool drop) |
1473 | { |
1474 | u32 len = get_cobalt_cb(skb)->adjusted_len; |
1475 | |
1476 | /* charge packet bandwidth to this tin |
1477 | * and to the global shaper. |
1478 | */ |
1479 | if (q->rate_ns) { |
1480 | u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; |
1481 | u64 global_dur = (len * q->rate_ns) >> q->rate_shft; |
1482 | u64 failsafe_dur = global_dur + (global_dur >> 1); |
1483 | |
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = ktime_add_ns(b->time_next_packet,
							   tin_dur);

		else if (ktime_before(b->time_next_packet,
1489 | ktime_add_ns(now, tin_dur))) |
1490 | b->time_next_packet = ktime_add_ns(now, tin_dur); |
1491 | |
1492 | q->time_next_packet = ktime_add_ns(q->time_next_packet, |
1493 | global_dur); |
1494 | if (!drop) |
1495 | q->failsafe_next_packet = \ |
1496 | ktime_add_ns(q->failsafe_next_packet, |
1497 | failsafe_dur); |
1498 | } |
1499 | return len; |
1500 | } |
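
/* Shaper arithmetic as a rough example (the actual rate_ns/rate_shft
 * values are derived from the configured rate elsewhere): at 100 Mbit/s
 * one byte takes 80 ns of link time, so a 1500-byte packet advances
 * time_next_packet by ~120 us and failsafe_next_packet by 1.5x that,
 * ~180 us.
 */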
1501 | |
1502 | static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) |
1503 | { |
1504 | struct cake_sched_data *q = qdisc_priv(sch); |
1505 | ktime_t now = ktime_get(); |
1506 | u32 idx = 0, tin = 0, len; |
1507 | struct cake_heap_entry qq; |
1508 | struct cake_tin_data *b; |
1509 | struct cake_flow *flow; |
1510 | struct sk_buff *skb; |
1511 | |
1512 | if (!q->overflow_timeout) { |
1513 | int i; |
1514 | /* Build fresh max-heap */ |
1515 | for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) |
1516 | cake_heapify(q, i); |
1517 | } |
1518 | q->overflow_timeout = 65535; |
1519 | |
1520 | /* select longest queue for pruning */ |
1521 | qq = q->overflow_heap[0]; |
1522 | tin = qq.t; |
1523 | idx = qq.b; |
1524 | |
1525 | b = &q->tins[tin]; |
1526 | flow = &b->flows[idx]; |
1527 | skb = dequeue_head(flow); |
1528 | if (unlikely(!skb)) { |
1529 | /* heap has gone wrong, rebuild it next time */ |
1530 | q->overflow_timeout = 0; |
1531 | return idx + (tin << 16); |
1532 | } |
1533 | |
	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1535 | b->unresponsive_flow_count++; |
1536 | |
1537 | len = qdisc_pkt_len(skb); |
1538 | q->buffer_used -= skb->truesize; |
1539 | b->backlogs[idx] -= len; |
1540 | b->tin_backlog -= len; |
1541 | sch->qstats.backlog -= len; |
	qdisc_tree_reduce_backlog(sch, 1, len);
1543 | |
1544 | flow->dropped++; |
1545 | b->tin_dropped++; |
1546 | sch->qstats.drops++; |
1547 | |
1548 | if (q->rate_flags & CAKE_FLAG_INGRESS) |
		cake_advance_shaper(q, b, skb, now, true);
1550 | |
1551 | __qdisc_drop(skb, to_free); |
1552 | sch->q.qlen--; |
1553 | |
	cake_heapify(q, 0);
1555 | |
1556 | return idx + (tin << 16); |
1557 | } |
1558 | |
1559 | static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) |
1560 | { |
1561 | const int offset = skb_network_offset(skb); |
1562 | u16 *buf, buf_; |
1563 | u8 dscp; |
1564 | |
	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* ToS is in the second byte of iphdr */
		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1573 | |
1574 | if (wash && dscp) { |
1575 | const int wlen = offset + sizeof(struct iphdr); |
1576 | |
			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1582 | } |
1583 | |
1584 | return dscp; |
1585 | |
1586 | case htons(ETH_P_IPV6): |
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1588 | if (unlikely(!buf)) |
1589 | return 0; |
1590 | |
1591 | /* Traffic class is in the first and second bytes of ipv6hdr */ |
		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1593 | |
1594 | if (wash && dscp) { |
1595 | const int wlen = offset + sizeof(struct ipv6hdr); |
1596 | |
			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1602 | } |
1603 | |
1604 | return dscp; |
1605 | |
1606 | case htons(ETH_P_ARP): |
1607 | return 0x38; /* CS7 - Net Control */ |
1608 | |
1609 | default: |
1610 | /* If there is no Diffserv field, treat as best-effort */ |
1611 | return 0; |
1612 | } |
1613 | } |
1614 | |
1615 | static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, |
1616 | struct sk_buff *skb) |
1617 | { |
1618 | struct cake_sched_data *q = qdisc_priv(sch); |
1619 | u32 tin, mark; |
1620 | bool wash; |
1621 | u8 dscp; |
1622 | |
1623 | /* Tin selection: Default to diffserv-based selection, allow overriding |
1624 | * using firewall marks or skb->priority. Call DSCP parsing early if |
1625 | * wash is enabled, otherwise defer to below to skip unneeded parsing. |
1626 | */ |
1627 | mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; |
1628 | wash = !!(q->rate_flags & CAKE_FLAG_WASH); |
1629 | if (wash) |
1630 | dscp = cake_handle_diffserv(skb, wash); |
1631 | |
1632 | if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) |
1633 | tin = 0; |
1634 | |
1635 | else if (mark && mark <= q->tin_cnt) |
1636 | tin = q->tin_order[mark - 1]; |
1637 | |
1638 | else if (TC_H_MAJ(skb->priority) == sch->handle && |
1639 | TC_H_MIN(skb->priority) > 0 && |
1640 | TC_H_MIN(skb->priority) <= q->tin_cnt) |
1641 | tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; |
1642 | |
1643 | else { |
1644 | if (!wash) |
1645 | dscp = cake_handle_diffserv(skb, wash); |
1646 | tin = q->tin_index[dscp]; |
1647 | |
1648 | if (unlikely(tin >= q->tin_cnt)) |
1649 | tin = 0; |
1650 | } |
1651 | |
1652 | return &q->tins[tin]; |
1653 | } |
1654 | |
1655 | static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, |
1656 | struct sk_buff *skb, int flow_mode, int *qerr) |
1657 | { |
1658 | struct cake_sched_data *q = qdisc_priv(sch); |
1659 | struct tcf_proto *filter; |
1660 | struct tcf_result res; |
1661 | u16 flow = 0, host = 0; |
1662 | int result; |
1663 | |
1664 | filter = rcu_dereference_bh(q->filter_list); |
1665 | if (!filter) |
1666 | goto hash; |
1667 | |
1668 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; |
	result = tcf_classify(skb, NULL, filter, &res, false);
1670 | |
1671 | if (result >= 0) { |
1672 | #ifdef CONFIG_NET_CLS_ACT |
1673 | switch (result) { |
1674 | case TC_ACT_STOLEN: |
1675 | case TC_ACT_QUEUED: |
1676 | case TC_ACT_TRAP: |
1677 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; |
1678 | fallthrough; |
1679 | case TC_ACT_SHOT: |
1680 | return 0; |
1681 | } |
1682 | #endif |
1683 | if (TC_H_MIN(res.classid) <= CAKE_QUEUES) |
1684 | flow = TC_H_MIN(res.classid); |
1685 | if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) |
1686 | host = TC_H_MAJ(res.classid) >> 16; |
1687 | } |
1688 | hash: |
1689 | *t = cake_select_tin(sch, skb); |
	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1691 | } |
1692 | |
1693 | static void cake_reconfigure(struct Qdisc *sch); |
1694 | |
1695 | static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
1696 | struct sk_buff **to_free) |
1697 | { |
1698 | struct cake_sched_data *q = qdisc_priv(sch); |
1699 | int len = qdisc_pkt_len(skb); |
1700 | int ret; |
1701 | struct sk_buff *ack = NULL; |
1702 | ktime_t now = ktime_get(); |
1703 | struct cake_tin_data *b; |
1704 | struct cake_flow *flow; |
1705 | u32 idx; |
1706 | |
1707 | /* choose flow to insert into */ |
idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1709 | if (idx == 0) { |
1710 | if (ret & __NET_XMIT_BYPASS) |
1711 | qdisc_qstats_drop(sch); |
1712 | __qdisc_drop(skb, to_free); |
1713 | return ret; |
1714 | } |
1715 | idx--; |
1716 | flow = &b->flows[idx]; |
1717 | |
1718 | /* ensure shaper state isn't stale */ |
1719 | if (!b->tin_backlog) { |
if (ktime_before(b->time_next_packet, now))
1721 | b->time_next_packet = now; |
1722 | |
1723 | if (!sch->q.qlen) { |
if (ktime_before(q->time_next_packet, now)) {
1725 | q->failsafe_next_packet = now; |
1726 | q->time_next_packet = now; |
} else if (ktime_after(q->time_next_packet, now) &&
	   ktime_after(q->failsafe_next_packet, now)) {
1729 | u64 next = \ |
1730 | min(ktime_to_ns(q->time_next_packet), |
1731 | ktime_to_ns( |
1732 | q->failsafe_next_packet)); |
1733 | sch->qstats.overlimits++; |
qdisc_watchdog_schedule_ns(&q->watchdog, next);
1735 | } |
1736 | } |
1737 | } |
1738 | |
1739 | if (unlikely(len > b->max_skblen)) |
1740 | b->max_skblen = len; |
1741 | |
1742 | if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { |
1743 | struct sk_buff *segs, *nskb; |
1744 | netdev_features_t features = netif_skb_features(skb); |
1745 | unsigned int slen = 0, numsegs = 0; |
1746 | |
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
1749 | return qdisc_drop(skb, sch, to_free); |
1750 | |
1751 | skb_list_walk_safe(segs, segs, nskb) { |
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
cobalt_set_enqueue_time(segs, now);
get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
						  segs);
flow_queue_add(flow, segs);
1758 | |
1759 | sch->q.qlen++; |
1760 | numsegs++; |
1761 | slen += segs->len; |
1762 | q->buffer_used += segs->truesize; |
1763 | b->packets++; |
1764 | } |
1765 | |
1766 | /* stats */ |
1767 | b->bytes += slen; |
1768 | b->backlogs[idx] += slen; |
1769 | b->tin_backlog += slen; |
1770 | sch->qstats.backlog += slen; |
1771 | q->avg_window_bytes += slen; |
1772 | |
qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1774 | consume_skb(skb); |
1775 | } else { |
1776 | /* not splitting */ |
1777 | cobalt_set_enqueue_time(skb, now); |
1778 | get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); |
1779 | flow_queue_add(flow, skb); |
1780 | |
1781 | if (q->ack_filter) |
1782 | ack = cake_ack_filter(q, flow); |
1783 | |
1784 | if (ack) { |
1785 | b->ack_drops++; |
1786 | sch->qstats.drops++; |
b->bytes += qdisc_pkt_len(ack);
len -= qdisc_pkt_len(ack);
q->buffer_used += skb->truesize - ack->truesize;
if (q->rate_flags & CAKE_FLAG_INGRESS)
cake_advance_shaper(q, b, ack, now, true);

qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
consume_skb(ack);
1795 | } else { |
1796 | sch->q.qlen++; |
1797 | q->buffer_used += skb->truesize; |
1798 | } |
1799 | |
1800 | /* stats */ |
1801 | b->packets++; |
1802 | b->bytes += len; |
1803 | b->backlogs[idx] += len; |
1804 | b->tin_backlog += len; |
1805 | sch->qstats.backlog += len; |
1806 | q->avg_window_bytes += len; |
1807 | } |
1808 | |
1809 | if (q->overflow_timeout) |
cake_heapify_up(q, b->overflow_idx[idx]);
1811 | |
1812 | /* incoming bandwidth capacity estimate */ |
1813 | if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { |
1814 | u64 packet_interval = \ |
1815 | ktime_to_ns(ktime_sub(now, q->last_packet_time)); |
1816 | |
1817 | if (packet_interval > NSEC_PER_SEC) |
1818 | packet_interval = NSEC_PER_SEC; |
1819 | |
1820 | /* filter out short-term bursts, eg. wifi aggregation */ |
q->avg_packet_interval = \
cake_ewma(q->avg_packet_interval,
	  packet_interval,
	  (packet_interval > q->avg_packet_interval ?
	   2 : 8));
1826 | |
1827 | q->last_packet_time = now; |
1828 | |
1829 | if (packet_interval > q->avg_packet_interval) { |
1830 | u64 window_interval = \ |
1831 | ktime_to_ns(ktime_sub(now, |
1832 | q->avg_window_begin)); |
1833 | u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; |
1834 | |
b = div64_u64(b, window_interval);
q->avg_peak_bandwidth =
	cake_ewma(q->avg_peak_bandwidth, b,
		  b > q->avg_peak_bandwidth ? 2 : 8);
1839 | q->avg_window_bytes = 0; |
1840 | q->avg_window_begin = now; |
1841 | |
if (ktime_after(now,
		ktime_add_ms(q->last_reconfig_time,
			     250))) {
1845 | q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; |
1846 | cake_reconfigure(sch); |
1847 | } |
1848 | } |
1849 | } else { |
1850 | q->avg_window_bytes = 0; |
1851 | q->last_packet_time = now; |
1852 | } |
1853 | |
1854 | /* flowchain */ |
1855 | if (!flow->set || flow->set == CAKE_SET_DECAYING) { |
1856 | struct cake_host *srchost = &b->hosts[flow->srchost]; |
1857 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; |
1858 | u16 host_load = 1; |
1859 | |
1860 | if (!flow->set) { |
list_add_tail(&flow->flowchain, &b->new_flows);
} else {
b->decaying_flow_count--;
list_move_tail(&flow->flowchain, &b->new_flows);
1865 | } |
1866 | flow->set = CAKE_SET_SPARSE; |
1867 | b->sparse_flow_count++; |
1868 | |
if (cake_dsrc(q->flow_mode))
1870 | host_load = max(host_load, srchost->srchost_bulk_flow_count); |
1871 | |
if (cake_ddst(q->flow_mode))
1873 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); |
1874 | |
1875 | flow->deficit = (b->flow_quantum * |
1876 | quantum_div[host_load]) >> 16; |
1877 | } else if (flow->set == CAKE_SET_SPARSE_WAIT) { |
1878 | struct cake_host *srchost = &b->hosts[flow->srchost]; |
1879 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; |
1880 | |
1881 | /* this flow was empty, accounted as a sparse flow, but actually |
1882 | * in the bulk rotation. |
1883 | */ |
1884 | flow->set = CAKE_SET_BULK; |
1885 | b->sparse_flow_count--; |
1886 | b->bulk_flow_count++; |
1887 | |
if (cake_dsrc(q->flow_mode))
1889 | srchost->srchost_bulk_flow_count++; |
1890 | |
if (cake_ddst(q->flow_mode))
1892 | dsthost->dsthost_bulk_flow_count++; |
1893 | |
1894 | } |
1895 | |
1896 | if (q->buffer_used > q->buffer_max_used) |
1897 | q->buffer_max_used = q->buffer_used; |
1898 | |
1899 | if (q->buffer_used > q->buffer_limit) { |
1900 | u32 dropped = 0; |
1901 | |
1902 | while (q->buffer_used > q->buffer_limit) { |
1903 | dropped++; |
1904 | cake_drop(sch, to_free); |
1905 | } |
1906 | b->drop_overlimit += dropped; |
1907 | } |
1908 | return NET_XMIT_SUCCESS; |
1909 | } |
1910 | |
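/* Remove one packet from the currently selected flow (q->cur_tin /
 * q->cur_flow), updating the backlog accounting and the overflow heap.
 * Returns NULL if that flow is empty.
 */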
1911 | static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) |
1912 | { |
1913 | struct cake_sched_data *q = qdisc_priv(sch); |
1914 | struct cake_tin_data *b = &q->tins[q->cur_tin]; |
1915 | struct cake_flow *flow = &b->flows[q->cur_flow]; |
1916 | struct sk_buff *skb = NULL; |
1917 | u32 len; |
1918 | |
1919 | if (flow->head) { |
1920 | skb = dequeue_head(flow); |
1921 | len = qdisc_pkt_len(skb); |
1922 | b->backlogs[q->cur_flow] -= len; |
1923 | b->tin_backlog -= len; |
1924 | sch->qstats.backlog -= len; |
1925 | q->buffer_used -= skb->truesize; |
1926 | sch->q.qlen--; |
1927 | |
1928 | if (q->overflow_timeout) |
cake_heapify(q, b->overflow_idx[q->cur_flow]);
1930 | } |
1931 | return skb; |
1932 | } |
1933 | |
1934 | /* Discard leftover packets from a tin no longer in use. */ |
1935 | static void cake_clear_tin(struct Qdisc *sch, u16 tin) |
1936 | { |
1937 | struct cake_sched_data *q = qdisc_priv(sch); |
1938 | struct sk_buff *skb; |
1939 | |
1940 | q->cur_tin = tin; |
1941 | for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) |
1942 | while (!!(skb = cake_dequeue_one(sch))) |
1943 | kfree_skb(skb); |
1944 | } |
1945 | |
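/* Dequeue path: enforce the global shaper, pick a tin (plain DRR when
 * unshaped; highest-priority tin meeting its schedule, or the earliest
 * scheduled one, when shaped), then run DRR++ with triple isolation
 * across that tin's flows. Each candidate packet is offered to COBALT,
 * which may ECN-mark or drop it, before one is finally delivered; the
 * shaper and deficits are advanced for everything that leaves the queue.
 */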
1946 | static struct sk_buff *cake_dequeue(struct Qdisc *sch) |
1947 | { |
1948 | struct cake_sched_data *q = qdisc_priv(sch); |
1949 | struct cake_tin_data *b = &q->tins[q->cur_tin]; |
1950 | struct cake_host *srchost, *dsthost; |
1951 | ktime_t now = ktime_get(); |
1952 | struct cake_flow *flow; |
1953 | struct list_head *head; |
1954 | bool first_flow = true; |
1955 | struct sk_buff *skb; |
1956 | u16 host_load; |
1957 | u64 delay; |
1958 | u32 len; |
1959 | |
1960 | begin: |
1961 | if (!sch->q.qlen) |
1962 | return NULL; |
1963 | |
1964 | /* global hard shaper */ |
if (ktime_after(q->time_next_packet, now) &&
    ktime_after(q->failsafe_next_packet, now)) {
1967 | u64 next = min(ktime_to_ns(q->time_next_packet), |
1968 | ktime_to_ns(q->failsafe_next_packet)); |
1969 | |
1970 | sch->qstats.overlimits++; |
qdisc_watchdog_schedule_ns(&q->watchdog, next);
1972 | return NULL; |
1973 | } |
1974 | |
1975 | /* Choose a class to work on. */ |
1976 | if (!q->rate_ns) { |
1977 | /* In unlimited mode, can't rely on shaper timings, just balance |
1978 | * with DRR |
1979 | */ |
1980 | bool wrapped = false, empty = true; |
1981 | |
1982 | while (b->tin_deficit < 0 || |
1983 | !(b->sparse_flow_count + b->bulk_flow_count)) { |
1984 | if (b->tin_deficit <= 0) |
1985 | b->tin_deficit += b->tin_quantum; |
1986 | if (b->sparse_flow_count + b->bulk_flow_count) |
1987 | empty = false; |
1988 | |
1989 | q->cur_tin++; |
1990 | b++; |
1991 | if (q->cur_tin >= q->tin_cnt) { |
1992 | q->cur_tin = 0; |
1993 | b = q->tins; |
1994 | |
1995 | if (wrapped) { |
1996 | /* It's possible for q->qlen to be |
1997 | * nonzero when we actually have no |
1998 | * packets anywhere. |
1999 | */ |
2000 | if (empty) |
2001 | return NULL; |
2002 | } else { |
2003 | wrapped = true; |
2004 | } |
2005 | } |
2006 | } |
2007 | } else { |
2008 | /* In shaped mode, choose: |
2009 | * - Highest-priority tin with queue and meeting schedule, or |
2010 | * - The earliest-scheduled tin with queue. |
2011 | */ |
2012 | ktime_t best_time = KTIME_MAX; |
2013 | int tin, best_tin = 0; |
2014 | |
2015 | for (tin = 0; tin < q->tin_cnt; tin++) { |
2016 | b = q->tins + tin; |
2017 | if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { |
2018 | ktime_t time_to_pkt = \ |
2019 | ktime_sub(b->time_next_packet, now); |
2020 | |
if (ktime_to_ns(time_to_pkt) <= 0 ||
    ktime_compare(time_to_pkt,
		  best_time) <= 0) {
2024 | best_time = time_to_pkt; |
2025 | best_tin = tin; |
2026 | } |
2027 | } |
2028 | } |
2029 | |
2030 | q->cur_tin = best_tin; |
2031 | b = q->tins + best_tin; |
2032 | |
2033 | /* No point in going further if no packets to deliver. */ |
2034 | if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) |
2035 | return NULL; |
2036 | } |
2037 | |
2038 | retry: |
2039 | /* service this class */ |
2040 | head = &b->decaying_flows; |
2041 | if (!first_flow || list_empty(head)) { |
2042 | head = &b->new_flows; |
2043 | if (list_empty(head)) { |
2044 | head = &b->old_flows; |
2045 | if (unlikely(list_empty(head))) { |
2046 | head = &b->decaying_flows; |
2047 | if (unlikely(list_empty(head))) |
2048 | goto begin; |
2049 | } |
2050 | } |
2051 | } |
2052 | flow = list_first_entry(head, struct cake_flow, flowchain); |
2053 | q->cur_flow = flow - b->flows; |
2054 | first_flow = false; |
2055 | |
2056 | /* triple isolation (modified DRR++) */ |
2057 | srchost = &b->hosts[flow->srchost]; |
2058 | dsthost = &b->hosts[flow->dsthost]; |
2059 | host_load = 1; |
2060 | |
2061 | /* flow isolation (DRR++) */ |
2062 | if (flow->deficit <= 0) { |
2063 | /* Keep all flows with deficits out of the sparse and decaying |
2064 | * rotations. No non-empty flow can go into the decaying |
2065 | * rotation, so they can't get deficits |
2066 | */ |
2067 | if (flow->set == CAKE_SET_SPARSE) { |
2068 | if (flow->head) { |
2069 | b->sparse_flow_count--; |
2070 | b->bulk_flow_count++; |
2071 | |
if (cake_dsrc(q->flow_mode))
2073 | srchost->srchost_bulk_flow_count++; |
2074 | |
if (cake_ddst(q->flow_mode))
2076 | dsthost->dsthost_bulk_flow_count++; |
2077 | |
2078 | flow->set = CAKE_SET_BULK; |
2079 | } else { |
2080 | /* we've moved it to the bulk rotation for |
2081 | * correct deficit accounting but we still want |
2082 | * to count it as a sparse flow, not a bulk one. |
2083 | */ |
2084 | flow->set = CAKE_SET_SPARSE_WAIT; |
2085 | } |
2086 | } |
2087 | |
if (cake_dsrc(q->flow_mode))
2089 | host_load = max(host_load, srchost->srchost_bulk_flow_count); |
2090 | |
if (cake_ddst(q->flow_mode))
2092 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); |
2093 | |
2094 | WARN_ON(host_load > CAKE_QUEUES); |
2095 | |
2096 | /* The get_random_u16() is a way to apply dithering to avoid |
2097 | * accumulating roundoff errors |
2098 | */ |
2099 | flow->deficit += (b->flow_quantum * quantum_div[host_load] + |
2100 | get_random_u16()) >> 16; |
list_move_tail(&flow->flowchain, &b->old_flows);
2102 | |
2103 | goto retry; |
2104 | } |
2105 | |
2106 | /* Retrieve a packet via the AQM */ |
2107 | while (1) { |
2108 | skb = cake_dequeue_one(sch); |
2109 | if (!skb) { |
2110 | /* this queue was actually empty */ |
if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2112 | b->unresponsive_flow_count--; |
2113 | |
2114 | if (flow->cvars.p_drop || flow->cvars.count || |
    ktime_before(now, flow->cvars.drop_next)) {
2116 | /* keep in the flowchain until the state has |
2117 | * decayed to rest |
2118 | */ |
list_move_tail(&flow->flowchain,
	       &b->decaying_flows);
2121 | if (flow->set == CAKE_SET_BULK) { |
2122 | b->bulk_flow_count--; |
2123 | |
if (cake_dsrc(q->flow_mode))
2125 | srchost->srchost_bulk_flow_count--; |
2126 | |
if (cake_ddst(q->flow_mode))
2128 | dsthost->dsthost_bulk_flow_count--; |
2129 | |
2130 | b->decaying_flow_count++; |
2131 | } else if (flow->set == CAKE_SET_SPARSE || |
2132 | flow->set == CAKE_SET_SPARSE_WAIT) { |
2133 | b->sparse_flow_count--; |
2134 | b->decaying_flow_count++; |
2135 | } |
2136 | flow->set = CAKE_SET_DECAYING; |
2137 | } else { |
2138 | /* remove empty queue from the flowchain */ |
list_del_init(&flow->flowchain);
2140 | if (flow->set == CAKE_SET_SPARSE || |
2141 | flow->set == CAKE_SET_SPARSE_WAIT) |
2142 | b->sparse_flow_count--; |
2143 | else if (flow->set == CAKE_SET_BULK) { |
2144 | b->bulk_flow_count--; |
2145 | |
if (cake_dsrc(q->flow_mode))
2147 | srchost->srchost_bulk_flow_count--; |
2148 | |
if (cake_ddst(q->flow_mode))
2150 | dsthost->dsthost_bulk_flow_count--; |
2151 | |
2152 | } else |
2153 | b->decaying_flow_count--; |
2154 | |
2155 | flow->set = CAKE_SET_NONE; |
2156 | } |
2157 | goto begin; |
2158 | } |
2159 | |
2160 | /* Last packet in queue may be marked, shouldn't be dropped */ |
if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
			(b->bulk_flow_count *
			 !!(q->rate_flags &
			    CAKE_FLAG_INGRESS))) ||
2165 | !flow->head) |
2166 | break; |
2167 | |
2168 | /* drop this packet, get another one */ |
2169 | if (q->rate_flags & CAKE_FLAG_INGRESS) { |
len = cake_advance_shaper(q, b, skb,
			  now, true);
2172 | flow->deficit -= len; |
2173 | b->tin_deficit -= len; |
2174 | } |
2175 | flow->dropped++; |
2176 | b->tin_dropped++; |
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2178 | qdisc_qstats_drop(sch); |
2179 | kfree_skb(skb); |
2180 | if (q->rate_flags & CAKE_FLAG_INGRESS) |
2181 | goto retry; |
2182 | } |
2183 | |
2184 | b->tin_ecn_mark += !!flow->cvars.ecn_marked; |
2185 | qdisc_bstats_update(sch, skb); |
2186 | |
2187 | /* collect delay stats */ |
2188 | delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); |
b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
b->peak_delay = cake_ewma(b->peak_delay, delay,
			  delay > b->peak_delay ? 2 : 8);
b->base_delay = cake_ewma(b->base_delay, delay,
			  delay < b->base_delay ? 2 : 8);
2194 | |
len = cake_advance_shaper(q, b, skb, now, false);
2196 | flow->deficit -= len; |
2197 | b->tin_deficit -= len; |
2198 | |
if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2200 | u64 next = min(ktime_to_ns(q->time_next_packet), |
2201 | ktime_to_ns(q->failsafe_next_packet)); |
2202 | |
qdisc_watchdog_schedule_ns(&q->watchdog, next);
2204 | } else if (!sch->q.qlen) { |
2205 | int i; |
2206 | |
2207 | for (i = 0; i < q->tin_cnt; i++) { |
2208 | if (q->tins[i].decaying_flow_count) { |
2209 | ktime_t next = \ |
2210 | ktime_add_ns(now, |
2211 | q->tins[i].cparams.target); |
2212 | |
qdisc_watchdog_schedule_ns(&q->watchdog,
			   ktime_to_ns(next));
2215 | break; |
2216 | } |
2217 | } |
2218 | } |
2219 | |
2220 | if (q->overflow_timeout) |
2221 | q->overflow_timeout--; |
2222 | |
2223 | return skb; |
2224 | } |
2225 | |
2226 | static void cake_reset(struct Qdisc *sch) |
2227 | { |
2228 | struct cake_sched_data *q = qdisc_priv(sch); |
2229 | u32 c; |
2230 | |
2231 | if (!q->tins) |
2232 | return; |
2233 | |
2234 | for (c = 0; c < CAKE_MAX_TINS; c++) |
cake_clear_tin(sch, c);
2236 | } |
2237 | |
2238 | static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { |
2239 | [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, |
2240 | [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, |
2241 | [TCA_CAKE_ATM] = { .type = NLA_U32 }, |
2242 | [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, |
2243 | [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, |
2244 | [TCA_CAKE_RTT] = { .type = NLA_U32 }, |
2245 | [TCA_CAKE_TARGET] = { .type = NLA_U32 }, |
2246 | [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, |
2247 | [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, |
2248 | [TCA_CAKE_NAT] = { .type = NLA_U32 }, |
2249 | [TCA_CAKE_RAW] = { .type = NLA_U32 }, |
2250 | [TCA_CAKE_WASH] = { .type = NLA_U32 }, |
2251 | [TCA_CAKE_MPU] = { .type = NLA_U32 }, |
2252 | [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, |
2253 | [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, |
2254 | [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, |
2255 | [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, |
2256 | }; |
2257 | |
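/* Derive per-tin shaper and AQM parameters from a byte rate. The rate is
 * stored as a fixed-point time-per-byte (rate_ns >> rate_shft); the
 * COBALT target becomes at least 1.5x the MTU serialization time, and the
 * interval is stretched to keep the same headroom above the target.
 *
 * Illustrative numbers (assuming rate is in bytes/sec, as the
 * flow_quantum clamp suggests): at rate = 1,250,000 (~10 Mbit/s) with a
 * 1514-byte MTU, the MTU serialization time is ~1.21 ms, so the default
 * 5 ms target and 100 ms interval are left unchanged, and flow_quantum
 * becomes rate >> 12 = 305 bytes.
 */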
2258 | static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, |
2259 | u64 target_ns, u64 rtt_est_ns) |
2260 | { |
2261 | /* convert byte-rate into time-per-byte |
2262 | * so it will always unwedge in reasonable time. |
2263 | */ |
2264 | static const u64 MIN_RATE = 64; |
2265 | u32 byte_target = mtu; |
2266 | u64 byte_target_ns; |
2267 | u8 rate_shft = 0; |
2268 | u64 rate_ns = 0; |
2269 | |
2270 | b->flow_quantum = 1514; |
2271 | if (rate) { |
2272 | b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); |
2273 | rate_shft = 34; |
2274 | rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; |
rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2276 | while (!!(rate_ns >> 34)) { |
2277 | rate_ns >>= 1; |
2278 | rate_shft--; |
2279 | } |
2280 | } /* else unlimited, ie. zero delay */ |
2281 | |
2282 | b->tin_rate_bps = rate; |
2283 | b->tin_rate_ns = rate_ns; |
2284 | b->tin_rate_shft = rate_shft; |
2285 | |
2286 | byte_target_ns = (byte_target * rate_ns) >> rate_shft; |
2287 | |
2288 | b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); |
2289 | b->cparams.interval = max(rtt_est_ns + |
2290 | b->cparams.target - target_ns, |
2291 | b->cparams.target * 2); |
2292 | b->cparams.mtu_time = byte_target_ns; |
2293 | b->cparams.p_inc = 1 << 24; /* 1/256 */ |
2294 | b->cparams.p_dec = 1 << 20; /* 1/4096 */ |
2295 | } |
2296 | |
2297 | static int cake_config_besteffort(struct Qdisc *sch) |
2298 | { |
2299 | struct cake_sched_data *q = qdisc_priv(sch); |
2300 | struct cake_tin_data *b = &q->tins[0]; |
u32 mtu = psched_mtu(qdisc_dev(sch));
2302 | u64 rate = q->rate_bps; |
2303 | |
2304 | q->tin_cnt = 1; |
2305 | |
2306 | q->tin_index = besteffort; |
2307 | q->tin_order = normal_order; |
2308 | |
cake_set_rate(b, rate, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
2311 | b->tin_quantum = 65535; |
2312 | |
2313 | return 0; |
2314 | } |
2315 | |
2316 | static int cake_config_precedence(struct Qdisc *sch) |
2317 | { |
2318 | /* convert high-level (user visible) parameters into internal format */ |
2319 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2321 | u64 rate = q->rate_bps; |
2322 | u32 quantum = 256; |
2323 | u32 i; |
2324 | |
2325 | q->tin_cnt = 8; |
2326 | q->tin_index = precedence; |
2327 | q->tin_order = normal_order; |
2328 | |
2329 | for (i = 0; i < q->tin_cnt; i++) { |
2330 | struct cake_tin_data *b = &q->tins[i]; |
2331 | |
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
	      us_to_ns(q->interval));
2334 | |
2335 | b->tin_quantum = max_t(u16, 1U, quantum); |
2336 | |
2337 | /* calculate next class's parameters */ |
2338 | rate *= 7; |
2339 | rate >>= 3; |
2340 | |
2341 | quantum *= 7; |
2342 | quantum >>= 3; |
2343 | } |
2344 | |
2345 | return 0; |
2346 | } |
2347 | |
2348 | /* List of known Diffserv codepoints: |
2349 | * |
2350 | * Default Forwarding (DF/CS0) - Best Effort |
2351 | * Max Throughput (TOS2) |
2352 | * Min Delay (TOS4) |
2353 | * LLT "La" (TOS5) |
2354 | * Assured Forwarding 1 (AF1x) - x3 |
2355 | * Assured Forwarding 2 (AF2x) - x3 |
2356 | * Assured Forwarding 3 (AF3x) - x3 |
2357 | * Assured Forwarding 4 (AF4x) - x3 |
2358 | * Precedence Class 1 (CS1) |
2359 | * Precedence Class 2 (CS2) |
2360 | * Precedence Class 3 (CS3) |
2361 | * Precedence Class 4 (CS4) |
2362 | * Precedence Class 5 (CS5) |
2363 | * Precedence Class 6 (CS6) |
2364 | * Precedence Class 7 (CS7) |
2365 | * Voice Admit (VA) |
2366 | * Expedited Forwarding (EF) |
2367 | * Lower Effort (LE) |
2368 | * |
2369 | * Total 26 codepoints. |
2370 | */ |
2371 | |
2372 | /* List of traffic classes in RFC 4594, updated by RFC 8622: |
2373 | * (roughly descending order of contended priority) |
2374 | * (roughly ascending order of uncontended throughput) |
2375 | * |
2376 | * Network Control (CS6,CS7) - routing traffic |
2377 | * Telephony (EF,VA) - aka. VoIP streams |
2378 | * Signalling (CS5) - VoIP setup |
2379 | * Multimedia Conferencing (AF4x) - aka. video calls |
2380 | * Realtime Interactive (CS4) - eg. games |
2381 | * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch |
2382 | * Broadcast Video (CS3) |
2383 | * Low-Latency Data (AF2x,TOS4) - eg. database |
2384 | * Ops, Admin, Management (CS2) - eg. ssh |
2385 | * Standard Service (DF & unrecognised codepoints) |
2386 | * High-Throughput Data (AF1x,TOS2) - eg. web traffic |
2387 | * Low-Priority Data (LE,CS1) - eg. BitTorrent |
2388 | * |
2389 | * Total 12 traffic classes. |
2390 | */ |
2391 | |
2392 | static int cake_config_diffserv8(struct Qdisc *sch) |
2393 | { |
2394 | /* Pruned list of traffic classes for typical applications: |
2395 | * |
2396 | * Network Control (CS6, CS7) |
2397 | * Minimum Latency (EF, VA, CS5, CS4) |
2398 | * Interactive Shell (CS2) |
2399 | * Low Latency Transactions (AF2x, TOS4) |
2400 | * Video Streaming (AF4x, AF3x, CS3) |
2401 | * Bog Standard (DF etc.) |
2402 | * High Throughput (AF1x, TOS2, CS1) |
2403 | * Background Traffic (LE) |
2404 | * |
2405 | * Total 8 traffic classes. |
2406 | */ |
2407 | |
2408 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2410 | u64 rate = q->rate_bps; |
2411 | u32 quantum = 256; |
2412 | u32 i; |
2413 | |
2414 | q->tin_cnt = 8; |
2415 | |
2416 | /* codepoint to class mapping */ |
2417 | q->tin_index = diffserv8; |
2418 | q->tin_order = normal_order; |
2419 | |
2420 | /* class characteristics */ |
2421 | for (i = 0; i < q->tin_cnt; i++) { |
2422 | struct cake_tin_data *b = &q->tins[i]; |
2423 | |
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
	      us_to_ns(q->interval));
2426 | |
2427 | b->tin_quantum = max_t(u16, 1U, quantum); |
2428 | |
2429 | /* calculate next class's parameters */ |
2430 | rate *= 7; |
2431 | rate >>= 3; |
2432 | |
2433 | quantum *= 7; |
2434 | quantum >>= 3; |
2435 | } |
2436 | |
2437 | return 0; |
2438 | } |
2439 | |
2440 | static int cake_config_diffserv4(struct Qdisc *sch) |
2441 | { |
2442 | /* Further pruned list of traffic classes for four-class system: |
2443 | * |
2444 | * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) |
2445 | * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) |
2446 | * Best Effort (DF, AF1x, TOS2, and those not specified) |
2447 | * Background Traffic (LE, CS1) |
2448 | * |
2449 | * Total 4 traffic classes. |
2450 | */ |
2451 | |
2452 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2454 | u64 rate = q->rate_bps; |
2455 | u32 quantum = 1024; |
2456 | |
2457 | q->tin_cnt = 4; |
2458 | |
2459 | /* codepoint to class mapping */ |
2460 | q->tin_index = diffserv4; |
2461 | q->tin_order = bulk_order; |
2462 | |
2463 | /* class characteristics */ |
cake_set_rate(&q->tins[0], rate, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[1], rate >> 4, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[2], rate >> 1, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[3], rate >> 2, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
2472 | |
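/* The threshold rates above are fixed fractions of the base rate
 * (1, 1/16, 1/2 and 1/4 for tins 0-3); the DRR quanta below use the same
 * ratios, so a tin's scheduling weight matches its bandwidth share once
 * it operates above its threshold.
 */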
2473 | /* bandwidth-sharing weights */ |
2474 | q->tins[0].tin_quantum = quantum; |
2475 | q->tins[1].tin_quantum = quantum >> 4; |
2476 | q->tins[2].tin_quantum = quantum >> 1; |
2477 | q->tins[3].tin_quantum = quantum >> 2; |
2478 | |
2479 | return 0; |
2480 | } |
2481 | |
2482 | static int cake_config_diffserv3(struct Qdisc *sch) |
2483 | { |
2484 | /* Simplified Diffserv structure with 3 tins. |
2485 | * Latency Sensitive (CS7, CS6, EF, VA, TOS4) |
2486 | * Best Effort |
2487 | * Low Priority (LE, CS1) |
2488 | */ |
2489 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2491 | u64 rate = q->rate_bps; |
2492 | u32 quantum = 1024; |
2493 | |
2494 | q->tin_cnt = 3; |
2495 | |
2496 | /* codepoint to class mapping */ |
2497 | q->tin_index = diffserv3; |
2498 | q->tin_order = bulk_order; |
2499 | |
2500 | /* class characteristics */ |
cake_set_rate(&q->tins[0], rate, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[1], rate >> 4, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[2], rate >> 2, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
2507 | |
2508 | /* bandwidth-sharing weights */ |
2509 | q->tins[0].tin_quantum = quantum; |
2510 | q->tins[1].tin_quantum = quantum >> 4; |
2511 | q->tins[2].tin_quantum = quantum >> 2; |
2512 | |
2513 | return 0; |
2514 | } |
2515 | |
2516 | static void cake_reconfigure(struct Qdisc *sch) |
2517 | { |
2518 | struct cake_sched_data *q = qdisc_priv(sch); |
2519 | int c, ft; |
2520 | |
2521 | switch (q->tin_mode) { |
2522 | case CAKE_DIFFSERV_BESTEFFORT: |
2523 | ft = cake_config_besteffort(sch); |
2524 | break; |
2525 | |
2526 | case CAKE_DIFFSERV_PRECEDENCE: |
2527 | ft = cake_config_precedence(sch); |
2528 | break; |
2529 | |
2530 | case CAKE_DIFFSERV_DIFFSERV8: |
2531 | ft = cake_config_diffserv8(sch); |
2532 | break; |
2533 | |
2534 | case CAKE_DIFFSERV_DIFFSERV4: |
2535 | ft = cake_config_diffserv4(sch); |
2536 | break; |
2537 | |
2538 | case CAKE_DIFFSERV_DIFFSERV3: |
2539 | default: |
2540 | ft = cake_config_diffserv3(sch); |
2541 | break; |
2542 | } |
2543 | |
2544 | for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { |
cake_clear_tin(sch, c);
2546 | q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; |
2547 | } |
2548 | |
2549 | q->rate_ns = q->tins[ft].tin_rate_ns; |
2550 | q->rate_shft = q->tins[ft].tin_rate_shft; |
2551 | |
2552 | if (q->buffer_config_limit) { |
2553 | q->buffer_limit = q->buffer_config_limit; |
2554 | } else if (q->rate_bps) { |
2555 | u64 t = q->rate_bps * q->interval; |
2556 | |
2557 | do_div(t, USEC_PER_SEC / 4); |
2558 | q->buffer_limit = max_t(u32, t, 4U << 20); |
2559 | } else { |
2560 | q->buffer_limit = ~0; |
2561 | } |
2562 | |
2563 | sch->flags &= ~TCQ_F_CAN_BYPASS; |
2564 | |
2565 | q->buffer_limit = min(q->buffer_limit, |
2566 | max(sch->limit * psched_mtu(qdisc_dev(sch)), |
2567 | q->buffer_config_limit)); |
2568 | } |
2569 | |
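/* Parse netlink configuration (validated against cake_policy), update the
 * qdisc parameters, and re-derive per-tin state under the tree lock if
 * the tins have already been allocated. Called both for the initial
 * options via cake_init() and for "tc qdisc change".
 */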
2570 | static int cake_change(struct Qdisc *sch, struct nlattr *opt, |
2571 | struct netlink_ext_ack *extack) |
2572 | { |
2573 | struct cake_sched_data *q = qdisc_priv(sch); |
2574 | struct nlattr *tb[TCA_CAKE_MAX + 1]; |
2575 | int err; |
2576 | |
err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2578 | extack); |
2579 | if (err < 0) |
2580 | return err; |
2581 | |
2582 | if (tb[TCA_CAKE_NAT]) { |
2583 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
2584 | q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; |
2585 | q->flow_mode |= CAKE_FLOW_NAT_FLAG * |
!!nla_get_u32(tb[TCA_CAKE_NAT]);
2587 | #else |
NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
		    "No conntrack support in kernel");
2590 | return -EOPNOTSUPP; |
2591 | #endif |
2592 | } |
2593 | |
2594 | if (tb[TCA_CAKE_BASE_RATE64]) |
q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2596 | |
2597 | if (tb[TCA_CAKE_DIFFSERV_MODE]) |
q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2599 | |
2600 | if (tb[TCA_CAKE_WASH]) { |
if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2602 | q->rate_flags |= CAKE_FLAG_WASH; |
2603 | else |
2604 | q->rate_flags &= ~CAKE_FLAG_WASH; |
2605 | } |
2606 | |
2607 | if (tb[TCA_CAKE_FLOW_MODE]) |
2608 | q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | |
(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2610 | CAKE_FLOW_MASK)); |
2611 | |
2612 | if (tb[TCA_CAKE_ATM]) |
q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2614 | |
2615 | if (tb[TCA_CAKE_OVERHEAD]) { |
q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2617 | q->rate_flags |= CAKE_FLAG_OVERHEAD; |
2618 | |
2619 | q->max_netlen = 0; |
2620 | q->max_adjlen = 0; |
2621 | q->min_netlen = ~0; |
2622 | q->min_adjlen = ~0; |
2623 | } |
2624 | |
2625 | if (tb[TCA_CAKE_RAW]) { |
2626 | q->rate_flags &= ~CAKE_FLAG_OVERHEAD; |
2627 | |
2628 | q->max_netlen = 0; |
2629 | q->max_adjlen = 0; |
2630 | q->min_netlen = ~0; |
2631 | q->min_adjlen = ~0; |
2632 | } |
2633 | |
2634 | if (tb[TCA_CAKE_MPU]) |
q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2636 | |
2637 | if (tb[TCA_CAKE_RTT]) { |
q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2639 | |
2640 | if (!q->interval) |
2641 | q->interval = 1; |
2642 | } |
2643 | |
2644 | if (tb[TCA_CAKE_TARGET]) { |
q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2646 | |
2647 | if (!q->target) |
2648 | q->target = 1; |
2649 | } |
2650 | |
2651 | if (tb[TCA_CAKE_AUTORATE]) { |
if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2653 | q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; |
2654 | else |
2655 | q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; |
2656 | } |
2657 | |
2658 | if (tb[TCA_CAKE_INGRESS]) { |
if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2660 | q->rate_flags |= CAKE_FLAG_INGRESS; |
2661 | else |
2662 | q->rate_flags &= ~CAKE_FLAG_INGRESS; |
2663 | } |
2664 | |
2665 | if (tb[TCA_CAKE_ACK_FILTER]) |
q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2667 | |
2668 | if (tb[TCA_CAKE_MEMORY]) |
q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2670 | |
2671 | if (tb[TCA_CAKE_SPLIT_GSO]) { |
if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2673 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; |
2674 | else |
2675 | q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; |
2676 | } |
2677 | |
2678 | if (tb[TCA_CAKE_FWMARK]) { |
q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2680 | q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0; |
2681 | } |
2682 | |
2683 | if (q->tins) { |
sch_tree_lock(sch);
cake_reconfigure(sch);
sch_tree_unlock(sch);
2687 | } |
2688 | |
2689 | return 0; |
2690 | } |
2691 | |
2692 | static void cake_destroy(struct Qdisc *sch) |
2693 | { |
2694 | struct cake_sched_data *q = qdisc_priv(sch); |
2695 | |
qdisc_watchdog_cancel(&q->watchdog);
tcf_block_put(q->block);
kvfree(q->tins);
2699 | } |
2700 | |
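/* Set up the defaults (diffserv3, triple isolation, unlimited rate,
 * 100 ms rtt / 5 ms target, GSO splitting on), apply any initial netlink
 * options, allocate the tin/flow arrays and overflow-heap indices, and
 * run the first reconfiguration.
 */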
2701 | static int cake_init(struct Qdisc *sch, struct nlattr *opt, |
2702 | struct netlink_ext_ack *extack) |
2703 | { |
2704 | struct cake_sched_data *q = qdisc_priv(sch); |
2705 | int i, j, err; |
2706 | |
2707 | sch->limit = 10240; |
2708 | q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; |
2709 | q->flow_mode = CAKE_FLOW_TRIPLE; |
2710 | |
2711 | q->rate_bps = 0; /* unlimited by default */ |
2712 | |
2713 | q->interval = 100000; /* 100ms default */ |
2714 | q->target = 5000; /* 5ms: codel RFC argues |
2715 | * for 5 to 10% of interval |
2716 | */ |
2717 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; |
2718 | q->cur_tin = 0; |
2719 | q->cur_flow = 0; |
2720 | |
qdisc_watchdog_init(&q->watchdog, sch);
2722 | |
2723 | if (opt) { |
2724 | err = cake_change(sch, opt, extack); |
2725 | |
2726 | if (err) |
2727 | return err; |
2728 | } |
2729 | |
err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2731 | if (err) |
2732 | return err; |
2733 | |
2734 | quantum_div[0] = ~0; |
2735 | for (i = 1; i <= CAKE_QUEUES; i++) |
2736 | quantum_div[i] = 65535 / i; |
2737 | |
q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
		   GFP_KERNEL);
2740 | if (!q->tins) |
2741 | return -ENOMEM; |
2742 | |
2743 | for (i = 0; i < CAKE_MAX_TINS; i++) { |
2744 | struct cake_tin_data *b = q->tins + i; |
2745 | |
INIT_LIST_HEAD(&b->new_flows);
INIT_LIST_HEAD(&b->old_flows);
INIT_LIST_HEAD(&b->decaying_flows);
2749 | b->sparse_flow_count = 0; |
2750 | b->bulk_flow_count = 0; |
2751 | b->decaying_flow_count = 0; |
2752 | |
2753 | for (j = 0; j < CAKE_QUEUES; j++) { |
2754 | struct cake_flow *flow = b->flows + j; |
2755 | u32 k = j * CAKE_MAX_TINS + i; |
2756 | |
INIT_LIST_HEAD(&flow->flowchain);
cobalt_vars_init(&flow->cvars);
2759 | |
2760 | q->overflow_heap[k].t = i; |
2761 | q->overflow_heap[k].b = j; |
2762 | b->overflow_idx[j] = k; |
2763 | } |
2764 | } |
2765 | |
2766 | cake_reconfigure(sch); |
2767 | q->avg_peak_bandwidth = q->rate_bps; |
2768 | q->min_netlen = ~0; |
2769 | q->min_adjlen = ~0; |
2770 | return 0; |
2771 | } |
2772 | |
2773 | static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) |
2774 | { |
2775 | struct cake_sched_data *q = qdisc_priv(sch); |
2776 | struct nlattr *opts; |
2777 | |
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2779 | if (!opts) |
2780 | goto nla_put_failure; |
2781 | |
if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
		      TCA_CAKE_PAD))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
		q->flow_mode & CAKE_FLOW_MASK))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
		!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_INGRESS,
		!!(q->rate_flags & CAKE_FLAG_INGRESS)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_NAT,
		!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_WASH,
		!!(q->rate_flags & CAKE_FLAG_WASH)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
goto nla_put_failure;

if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
		!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
goto nla_put_failure;

return nla_nest_end(skb, opts);
2842 | |
2843 | nla_put_failure: |
2844 | return -1; |
2845 | } |
2846 | |
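/* Dump global statistics (capacity estimate, memory usage, packet size
 * extremes) followed by one nested block per active tin, emitted in
 * q->tin_order so user space sees the tins in display order.
 */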
2847 | static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) |
2848 | { |
struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2850 | struct cake_sched_data *q = qdisc_priv(sch); |
2851 | struct nlattr *tstats, *ts; |
2852 | int i; |
2853 | |
2854 | if (!stats) |
2855 | return -1; |
2856 | |
2857 | #define PUT_STAT_U32(attr, data) do { \ |
2858 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ |
2859 | goto nla_put_failure; \ |
2860 | } while (0) |
2861 | #define PUT_STAT_U64(attr, data) do { \ |
2862 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ |
2863 | data, TCA_CAKE_STATS_PAD)) \ |
2864 | goto nla_put_failure; \ |
2865 | } while (0) |
2866 | |
2867 | PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); |
2868 | PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); |
2869 | PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); |
2870 | PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); |
2871 | PUT_STAT_U32(MAX_NETLEN, q->max_netlen); |
2872 | PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); |
2873 | PUT_STAT_U32(MIN_NETLEN, q->min_netlen); |
2874 | PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); |
2875 | |
2876 | #undef PUT_STAT_U32 |
2877 | #undef PUT_STAT_U64 |
2878 | |
tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2880 | if (!tstats) |
2881 | goto nla_put_failure; |
2882 | |
2883 | #define PUT_TSTAT_U32(attr, data) do { \ |
2884 | if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ |
2885 | goto nla_put_failure; \ |
2886 | } while (0) |
2887 | #define PUT_TSTAT_U64(attr, data) do { \ |
2888 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ |
2889 | data, TCA_CAKE_TIN_STATS_PAD)) \ |
2890 | goto nla_put_failure; \ |
2891 | } while (0) |
2892 | |
2893 | for (i = 0; i < q->tin_cnt; i++) { |
2894 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; |
2895 | |
ts = nla_nest_start_noflag(d->skb, i + 1);
2897 | if (!ts) |
2898 | goto nla_put_failure; |
2899 | |
2900 | PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); |
2901 | PUT_TSTAT_U64(SENT_BYTES64, b->bytes); |
2902 | PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); |
2903 | |
2904 | PUT_TSTAT_U32(TARGET_US, |
2905 | ktime_to_us(ns_to_ktime(b->cparams.target))); |
2906 | PUT_TSTAT_U32(INTERVAL_US, |
2907 | ktime_to_us(ns_to_ktime(b->cparams.interval))); |
2908 | |
2909 | PUT_TSTAT_U32(SENT_PACKETS, b->packets); |
2910 | PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); |
2911 | PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); |
2912 | PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); |
2913 | |
2914 | PUT_TSTAT_U32(PEAK_DELAY_US, |
2915 | ktime_to_us(ns_to_ktime(b->peak_delay))); |
2916 | PUT_TSTAT_U32(AVG_DELAY_US, |
2917 | ktime_to_us(ns_to_ktime(b->avge_delay))); |
2918 | PUT_TSTAT_U32(BASE_DELAY_US, |
2919 | ktime_to_us(ns_to_ktime(b->base_delay))); |
2920 | |
2921 | PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); |
2922 | PUT_TSTAT_U32(WAY_MISSES, b->way_misses); |
2923 | PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); |
2924 | |
2925 | PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + |
2926 | b->decaying_flow_count); |
2927 | PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); |
2928 | PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); |
2929 | PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); |
2930 | |
2931 | PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); |
nla_nest_end(d->skb, ts);
2933 | } |
2934 | |
2935 | #undef PUT_TSTAT_U32 |
2936 | #undef PUT_TSTAT_U64 |
2937 | |
nla_nest_end(d->skb, tstats);
return nla_nest_end(d->skb, stats);
2940 | |
2941 | nla_put_failure: |
nla_nest_cancel(d->skb, stats);
2943 | return -1; |
2944 | } |
2945 | |
2946 | static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) |
2947 | { |
2948 | return NULL; |
2949 | } |
2950 | |
2951 | static unsigned long cake_find(struct Qdisc *sch, u32 classid) |
2952 | { |
2953 | return 0; |
2954 | } |
2955 | |
2956 | static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, |
2957 | u32 classid) |
2958 | { |
2959 | return 0; |
2960 | } |
2961 | |
2962 | static void cake_unbind(struct Qdisc *q, unsigned long cl) |
2963 | { |
2964 | } |
2965 | |
2966 | static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, |
2967 | struct netlink_ext_ack *extack) |
2968 | { |
2969 | struct cake_sched_data *q = qdisc_priv(sch); |
2970 | |
2971 | if (cl) |
2972 | return NULL; |
2973 | return q->block; |
2974 | } |
2975 | |
2976 | static int cake_dump_class(struct Qdisc *sch, unsigned long cl, |
2977 | struct sk_buff *skb, struct tcmsg *tcm) |
2978 | { |
2979 | tcm->tcm_handle |= TC_H_MIN(cl); |
2980 | return 0; |
2981 | } |
2982 | |
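/* Per-"class" statistics: class IDs map to individual flow queues as
 * (tin_order index * CAKE_QUEUES) + flow + 1, matching cake_walk().
 * The queue length is counted by walking the flow's skb list under the
 * tree lock, and the flow's deficit and COBALT state are reported.
 */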
2983 | static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, |
2984 | struct gnet_dump *d) |
2985 | { |
2986 | struct cake_sched_data *q = qdisc_priv(sch); |
2987 | const struct cake_flow *flow = NULL; |
2988 | struct gnet_stats_queue qs = { 0 }; |
2989 | struct nlattr *stats; |
2990 | u32 idx = cl - 1; |
2991 | |
2992 | if (idx < CAKE_QUEUES * q->tin_cnt) { |
2993 | const struct cake_tin_data *b = \ |
2994 | &q->tins[q->tin_order[idx / CAKE_QUEUES]]; |
2995 | const struct sk_buff *skb; |
2996 | |
2997 | flow = &b->flows[idx % CAKE_QUEUES]; |
2998 | |
2999 | if (flow->head) { |
sch_tree_lock(sch);
3001 | skb = flow->head; |
3002 | while (skb) { |
3003 | qs.qlen++; |
3004 | skb = skb->next; |
3005 | } |
sch_tree_unlock(sch);
3007 | } |
3008 | qs.backlog = b->backlogs[idx % CAKE_QUEUES]; |
3009 | qs.drops = flow->dropped; |
3010 | } |
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3012 | return -1; |
3013 | if (flow) { |
3014 | ktime_t now = ktime_get(); |
3015 | |
stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3017 | if (!stats) |
3018 | return -1; |
3019 | |
3020 | #define PUT_STAT_U32(attr, data) do { \ |
3021 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ |
3022 | goto nla_put_failure; \ |
3023 | } while (0) |
3024 | #define PUT_STAT_S32(attr, data) do { \ |
3025 | if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ |
3026 | goto nla_put_failure; \ |
3027 | } while (0) |
3028 | |
3029 | PUT_STAT_S32(DEFICIT, flow->deficit); |
3030 | PUT_STAT_U32(DROPPING, flow->cvars.dropping); |
3031 | PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); |
3032 | PUT_STAT_U32(P_DROP, flow->cvars.p_drop); |
3033 | if (flow->cvars.p_drop) { |
3034 | PUT_STAT_S32(BLUE_TIMER_US, |
3035 | ktime_to_us( |
3036 | ktime_sub(now, |
3037 | flow->cvars.blue_timer))); |
3038 | } |
3039 | if (flow->cvars.dropping) { |
3040 | PUT_STAT_S32(DROP_NEXT_US, |
3041 | ktime_to_us( |
3042 | ktime_sub(now, |
3043 | flow->cvars.drop_next))); |
3044 | } |
3045 | |
if (nla_nest_end(d->skb, stats) < 0)
3047 | return -1; |
3048 | } |
3049 | |
3050 | return 0; |
3051 | |
3052 | nla_put_failure: |
nla_nest_cancel(d->skb, stats);
3054 | return -1; |
3055 | } |
3056 | |
3057 | static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
3058 | { |
3059 | struct cake_sched_data *q = qdisc_priv(sch); |
3060 | unsigned int i, j; |
3061 | |
3062 | if (arg->stop) |
3063 | return; |
3064 | |
3065 | for (i = 0; i < q->tin_cnt; i++) { |
3066 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; |
3067 | |
3068 | for (j = 0; j < CAKE_QUEUES; j++) { |
if (list_empty(&b->flows[j].flowchain)) {
3070 | arg->count++; |
3071 | continue; |
3072 | } |
if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
			 arg))
3075 | break; |
3076 | } |
3077 | } |
3078 | } |
3079 | |
3080 | static const struct Qdisc_class_ops cake_class_ops = { |
3081 | .leaf = cake_leaf, |
3082 | .find = cake_find, |
3083 | .tcf_block = cake_tcf_block, |
3084 | .bind_tcf = cake_bind, |
3085 | .unbind_tcf = cake_unbind, |
3086 | .dump = cake_dump_class, |
3087 | .dump_stats = cake_dump_class_stats, |
3088 | .walk = cake_walk, |
3089 | }; |
3090 | |
3091 | static struct Qdisc_ops cake_qdisc_ops __read_mostly = { |
3092 | .cl_ops = &cake_class_ops, |
.id = "cake",
3094 | .priv_size = sizeof(struct cake_sched_data), |
3095 | .enqueue = cake_enqueue, |
3096 | .dequeue = cake_dequeue, |
3097 | .peek = qdisc_peek_dequeued, |
3098 | .init = cake_init, |
3099 | .reset = cake_reset, |
3100 | .destroy = cake_destroy, |
3101 | .change = cake_change, |
3102 | .dump = cake_dump, |
3103 | .dump_stats = cake_dump_stats, |
3104 | .owner = THIS_MODULE, |
3105 | }; |
3106 | |
3107 | static int __init cake_module_init(void) |
3108 | { |
return register_qdisc(&cake_qdisc_ops);
3110 | } |
3111 | |
3112 | static void __exit cake_module_exit(void) |
3113 | { |
unregister_qdisc(&cake_qdisc_ops);
3115 | } |
3116 | |
3117 | module_init(cake_module_init) |
3118 | module_exit(cake_module_exit) |
MODULE_AUTHOR("Jonathan Morton");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("The CAKE shaper.");
3122 | |