1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
2 | |
3 | /* COMMON Applications Kept Enhanced (CAKE) discipline |
4 | * |
5 | * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> |
6 | * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> |
7 | * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> |
8 | * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> |
9 | * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> |
10 | * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> |
11 | * |
12 | * The CAKE Principles: |
13 | * (or, how to have your cake and eat it too) |
14 | * |
15 | * This is a combination of several shaping, AQM and FQ techniques into one |
16 | * easy-to-use package: |
17 | * |
18 | * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE |
19 | * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), |
20 | * eliminating the need for any sort of burst parameter (eg. token bucket |
21 | * depth). Burst support is limited to that necessary to overcome scheduling |
22 | * latency. |
23 | * |
24 | * - A Diffserv-aware priority queue, giving more priority to certain classes, |
25 | * up to a specified fraction of bandwidth. Above that bandwidth threshold, |
26 | * the priority is reduced to avoid starving other tins. |
27 | * |
28 | * - Each priority tin has a separate Flow Queue system, to isolate traffic |
29 | * flows from each other. This prevents a burst on one flow from increasing |
30 | * the delay to another. Flows are distributed to queues using a |
31 | * set-associative hash function. |
32 | * |
33 | * - Each queue is actively managed by Cobalt, which is a combination of the |
34 | * Codel and Blue AQM algorithms. This serves flows fairly, and signals |
35 | * congestion early via ECN (if available) and/or packet drops, to keep |
36 | * latency low. The codel parameters are auto-tuned based on the bandwidth |
37 | * setting, as is necessary at low bandwidths. |
38 | * |
39 | * The configuration parameters are kept deliberately simple for ease of use. |
40 | * Everything has sane defaults. Complete generality of configuration is *not* |
41 | * a goal. |
42 | * |
43 | * The priority queue operates according to a weighted DRR scheme, combined with |
44 | * a bandwidth tracker which reuses the shaper logic to detect which side of the |
45 | * bandwidth sharing threshold the tin is operating. This determines whether a |
46 | * priority-based weight (high) or a bandwidth-based weight (low) is used for |
47 | * that tin in the current pass. |
48 | * |
49 | * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly |
50 | * granted us permission to leverage. |
51 | */ |
52 | |
53 | #include <linux/module.h> |
54 | #include <linux/types.h> |
55 | #include <linux/kernel.h> |
56 | #include <linux/jiffies.h> |
57 | #include <linux/string.h> |
58 | #include <linux/in.h> |
59 | #include <linux/errno.h> |
60 | #include <linux/init.h> |
61 | #include <linux/skbuff.h> |
62 | #include <linux/jhash.h> |
63 | #include <linux/slab.h> |
64 | #include <linux/vmalloc.h> |
65 | #include <linux/reciprocal_div.h> |
66 | #include <net/netlink.h> |
67 | #include <linux/if_vlan.h> |
68 | #include <net/gso.h> |
69 | #include <net/pkt_sched.h> |
70 | #include <net/pkt_cls.h> |
71 | #include <net/tcp.h> |
72 | #include <net/flow_dissector.h> |
73 | |
74 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
75 | #include <net/netfilter/nf_conntrack_core.h> |
76 | #endif |
77 | |
78 | #define CAKE_SET_WAYS (8) |
79 | #define CAKE_MAX_TINS (8) |
80 | #define CAKE_QUEUES (1024) |
81 | #define CAKE_FLOW_MASK 63 |
82 | #define CAKE_FLOW_NAT_FLAG 64 |
83 | |
84 | /* struct cobalt_params - contains codel and blue parameters |
85 | * @interval: codel initial drop rate |
86 | * @target: maximum persistent sojourn time & blue update rate |
87 | * @mtu_time: serialisation delay of maximum-size packet |
88 | * @p_inc: increment of blue drop probability (0.32 fxp) |
89 | * @p_dec: decrement of blue drop probability (0.32 fxp) |
90 | */ |
91 | struct cobalt_params { |
92 | u64 interval; |
93 | u64 target; |
94 | u64 mtu_time; |
95 | u32 p_inc; |
96 | u32 p_dec; |
97 | }; |
98 | |
99 | /* struct cobalt_vars - contains codel and blue variables |
100 | * @count: codel dropping frequency |
101 | * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 |
102 | * @drop_next: time to drop next packet, or when we dropped last |
103 | * @blue_timer: Blue time to next drop |
104 | * @p_drop: BLUE drop probability (0.32 fxp) |
105 | * @dropping: set if in dropping state |
106 | * @ecn_marked: set if marked |
107 | */ |
108 | struct cobalt_vars { |
109 | u32 count; |
110 | u32 rec_inv_sqrt; |
111 | ktime_t drop_next; |
112 | ktime_t blue_timer; |
113 | u32 p_drop; |
114 | bool dropping; |
115 | bool ecn_marked; |
116 | }; |
117 | |
118 | enum { |
119 | CAKE_SET_NONE = 0, |
120 | CAKE_SET_SPARSE, |
121 | CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ |
122 | CAKE_SET_BULK, |
123 | CAKE_SET_DECAYING |
124 | }; |
125 | |
126 | struct cake_flow { |
127 | /* this stuff is all needed per-flow at dequeue time */ |
128 | struct sk_buff *head; |
129 | struct sk_buff *tail; |
130 | struct list_head flowchain; |
131 | s32 deficit; |
132 | u32 dropped; |
133 | struct cobalt_vars cvars; |
134 | u16 srchost; /* index into cake_host table */ |
135 | u16 dsthost; |
136 | u8 set; |
137 | }; /* please try to keep this structure <= 64 bytes */ |
138 | |
139 | struct cake_host { |
140 | u32 srchost_tag; |
141 | u32 dsthost_tag; |
142 | u16 srchost_bulk_flow_count; |
143 | u16 dsthost_bulk_flow_count; |
144 | }; |
145 | |
146 | struct cake_heap_entry { |
147 | u16 t:3, b:10; |
148 | }; |
149 | |
150 | struct cake_tin_data { |
151 | struct cake_flow flows[CAKE_QUEUES]; |
152 | u32 backlogs[CAKE_QUEUES]; |
153 | u32 tags[CAKE_QUEUES]; /* for set association */ |
154 | u16 overflow_idx[CAKE_QUEUES]; |
155 | struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ |
156 | u16 flow_quantum; |
157 | |
158 | struct cobalt_params cparams; |
159 | u32 drop_overlimit; |
160 | u16 bulk_flow_count; |
161 | u16 sparse_flow_count; |
162 | u16 decaying_flow_count; |
163 | u16 unresponsive_flow_count; |
164 | |
165 | u32 max_skblen; |
166 | |
167 | struct list_head new_flows; |
168 | struct list_head old_flows; |
169 | struct list_head decaying_flows; |
170 | |
171 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ |
172 | ktime_t time_next_packet; |
173 | u64 tin_rate_ns; |
174 | u64 tin_rate_bps; |
175 | u16 tin_rate_shft; |
176 | |
177 | u16 tin_quantum; |
178 | s32 tin_deficit; |
179 | u32 tin_backlog; |
180 | u32 tin_dropped; |
181 | u32 tin_ecn_mark; |
182 | |
183 | u32 packets; |
184 | u64 bytes; |
185 | |
186 | u32 ack_drops; |
187 | |
188 | /* moving averages */ |
189 | u64 avge_delay; |
190 | u64 peak_delay; |
191 | u64 base_delay; |
192 | |
193 | /* hash function stats */ |
194 | u32 way_directs; |
195 | u32 way_hits; |
196 | u32 way_misses; |
197 | u32 way_collisions; |
198 | }; /* number of tins is small, so size of this struct doesn't matter much */ |
199 | |
200 | struct cake_sched_data { |
201 | struct tcf_proto __rcu *filter_list; /* optional external classifier */ |
202 | struct tcf_block *block; |
203 | struct cake_tin_data *tins; |
204 | |
205 | struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; |
206 | u16 overflow_timeout; |
207 | |
208 | u16 tin_cnt; |
209 | u8 tin_mode; |
210 | u8 flow_mode; |
211 | u8 ack_filter; |
212 | u8 atm_mode; |
213 | |
214 | u32 fwmark_mask; |
215 | u16 fwmark_shft; |
216 | |
217 | /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ |
218 | u16 rate_shft; |
219 | ktime_t time_next_packet; |
220 | ktime_t failsafe_next_packet; |
221 | u64 rate_ns; |
222 | u64 rate_bps; |
223 | u16 rate_flags; |
224 | s16 rate_overhead; |
225 | u16 rate_mpu; |
226 | u64 interval; |
227 | u64 target; |
228 | |
229 | /* resource tracking */ |
230 | u32 buffer_used; |
231 | u32 buffer_max_used; |
232 | u32 buffer_limit; |
233 | u32 buffer_config_limit; |
234 | |
235 | /* indices for dequeue */ |
236 | u16 cur_tin; |
237 | u16 cur_flow; |
238 | |
239 | struct qdisc_watchdog watchdog; |
240 | const u8 *tin_index; |
241 | const u8 *tin_order; |
242 | |
243 | /* bandwidth capacity estimate */ |
244 | ktime_t last_packet_time; |
245 | ktime_t avg_window_begin; |
246 | u64 avg_packet_interval; |
247 | u64 avg_window_bytes; |
248 | u64 avg_peak_bandwidth; |
249 | ktime_t last_reconfig_time; |
250 | |
251 | /* packet length stats */ |
252 | u32 avg_netoff; |
253 | u16 max_netlen; |
254 | u16 max_adjlen; |
255 | u16 min_netlen; |
256 | u16 min_adjlen; |
257 | }; |
258 | |
259 | enum { |
260 | CAKE_FLAG_OVERHEAD = BIT(0), |
261 | CAKE_FLAG_AUTORATE_INGRESS = BIT(1), |
262 | CAKE_FLAG_INGRESS = BIT(2), |
263 | CAKE_FLAG_WASH = BIT(3), |
264 | CAKE_FLAG_SPLIT_GSO = BIT(4) |
265 | }; |
266 | |
267 | /* COBALT operates the Codel and BLUE algorithms in parallel, in order to |
268 | * obtain the best features of each. Codel is excellent on flows which |
269 | * respond to congestion signals in a TCP-like way. BLUE is more effective on |
270 | * unresponsive flows. |
271 | */ |
272 | |
273 | struct cobalt_skb_cb { |
274 | ktime_t enqueue_time; |
275 | u32 adjusted_len; |
276 | }; |
277 | |
278 | static u64 us_to_ns(u64 us) |
279 | { |
280 | return us * NSEC_PER_USEC; |
281 | } |
282 | |
283 | static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) |
284 | { |
	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 | return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; |
287 | } |
288 | |
289 | static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) |
290 | { |
291 | return get_cobalt_cb(skb)->enqueue_time; |
292 | } |
293 | |
294 | static void cobalt_set_enqueue_time(struct sk_buff *skb, |
295 | ktime_t now) |
296 | { |
297 | get_cobalt_cb(skb)->enqueue_time = now; |
298 | } |
299 | |
300 | static u16 quantum_div[CAKE_QUEUES + 1] = {0}; |
301 | |
302 | /* Diffserv lookup tables */ |
303 | |
304 | static const u8 precedence[] = { |
305 | 0, 0, 0, 0, 0, 0, 0, 0, |
306 | 1, 1, 1, 1, 1, 1, 1, 1, |
307 | 2, 2, 2, 2, 2, 2, 2, 2, |
308 | 3, 3, 3, 3, 3, 3, 3, 3, |
309 | 4, 4, 4, 4, 4, 4, 4, 4, |
310 | 5, 5, 5, 5, 5, 5, 5, 5, |
311 | 6, 6, 6, 6, 6, 6, 6, 6, |
312 | 7, 7, 7, 7, 7, 7, 7, 7, |
313 | }; |
314 | |
315 | static const u8 diffserv8[] = { |
316 | 2, 0, 1, 2, 4, 2, 2, 2, |
317 | 1, 2, 1, 2, 1, 2, 1, 2, |
318 | 5, 2, 4, 2, 4, 2, 4, 2, |
319 | 3, 2, 3, 2, 3, 2, 3, 2, |
320 | 6, 2, 3, 2, 3, 2, 3, 2, |
321 | 6, 2, 2, 2, 6, 2, 6, 2, |
322 | 7, 2, 2, 2, 2, 2, 2, 2, |
323 | 7, 2, 2, 2, 2, 2, 2, 2, |
324 | }; |
325 | |
326 | static const u8 diffserv4[] = { |
327 | 0, 1, 0, 0, 2, 0, 0, 0, |
328 | 1, 0, 0, 0, 0, 0, 0, 0, |
329 | 2, 0, 2, 0, 2, 0, 2, 0, |
330 | 2, 0, 2, 0, 2, 0, 2, 0, |
331 | 3, 0, 2, 0, 2, 0, 2, 0, |
332 | 3, 0, 0, 0, 3, 0, 3, 0, |
333 | 3, 0, 0, 0, 0, 0, 0, 0, |
334 | 3, 0, 0, 0, 0, 0, 0, 0, |
335 | }; |
336 | |
337 | static const u8 diffserv3[] = { |
338 | 0, 1, 0, 0, 2, 0, 0, 0, |
339 | 1, 0, 0, 0, 0, 0, 0, 0, |
340 | 0, 0, 0, 0, 0, 0, 0, 0, |
341 | 0, 0, 0, 0, 0, 0, 0, 0, |
342 | 0, 0, 0, 0, 0, 0, 0, 0, |
343 | 0, 0, 0, 0, 2, 0, 2, 0, |
344 | 2, 0, 0, 0, 0, 0, 0, 0, |
345 | 2, 0, 0, 0, 0, 0, 0, 0, |
346 | }; |
347 | |
348 | static const u8 besteffort[] = { |
349 | 0, 0, 0, 0, 0, 0, 0, 0, |
350 | 0, 0, 0, 0, 0, 0, 0, 0, |
351 | 0, 0, 0, 0, 0, 0, 0, 0, |
352 | 0, 0, 0, 0, 0, 0, 0, 0, |
353 | 0, 0, 0, 0, 0, 0, 0, 0, |
354 | 0, 0, 0, 0, 0, 0, 0, 0, |
355 | 0, 0, 0, 0, 0, 0, 0, 0, |
356 | 0, 0, 0, 0, 0, 0, 0, 0, |
357 | }; |
358 | |
359 | /* tin priority order for stats dumping */ |
360 | |
361 | static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; |
362 | static const u8 bulk_order[] = {1, 0, 2, 3}; |
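
/* Worked example of the lookups above (illustrative only): an EF-marked
 * packet (DSCP 46) indexes diffserv4[46] == 3, i.e. the Voice tin, while
 * CS1 (DSCP 8) indexes diffserv4[8] == 1, the Bulk tin. The *_order arrays
 * only control the order in which tins are dumped in the stats, so that
 * e.g. Bulk is reported before Best Effort; they do not affect scheduling.
 */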
363 | |
364 | #define REC_INV_SQRT_CACHE (16) |
365 | static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; |
366 | |
367 | /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots |
368 | * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) |
369 | * |
370 | * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 |
371 | */ |
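
/* Illustrative numerics for the Q0.32 format described above: 0xFFFFFFFF
 * is as close to 1.0 as the format can represent and 0x80000000 is 0.5,
 * so entries 0 and 1 of the cache below hold ~1.0 while entry 4 converges
 * to ~0x80000000, i.e. 1/sqrt(4).
 */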
372 | |
373 | static void cobalt_newton_step(struct cobalt_vars *vars) |
374 | { |
375 | u32 invsqrt, invsqrt2; |
376 | u64 val; |
377 | |
378 | invsqrt = vars->rec_inv_sqrt; |
379 | invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; |
380 | val = (3LL << 32) - ((u64)vars->count * invsqrt2); |
381 | |
382 | val >>= 2; /* avoid overflow in following multiply */ |
383 | val = (val * invsqrt) >> (32 - 2 + 1); |
384 | |
385 | vars->rec_inv_sqrt = val; |
386 | } |
387 | |
388 | static void cobalt_invsqrt(struct cobalt_vars *vars) |
389 | { |
390 | if (vars->count < REC_INV_SQRT_CACHE) |
391 | vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; |
392 | else |
393 | cobalt_newton_step(vars); |
394 | } |
395 | |
396 | /* There is a big difference in timing between the accurate values placed in |
397 | * the cache and the approximations given by a single Newton step for small |
398 | * count values, particularly when stepping from count 1 to 2 or vice versa. |
399 | * Above 16, a single Newton step gives sufficient accuracy in either |
400 | * direction, given the precision stored. |
401 | * |
402 | * The magnitude of the error when stepping up to count 2 is such as to give |
403 | * the value that *should* have been produced at count 4. |
404 | */ |
405 | |
406 | static void cobalt_cache_init(void) |
407 | { |
408 | struct cobalt_vars v; |
409 | |
410 | memset(&v, 0, sizeof(v)); |
411 | v.rec_inv_sqrt = ~0U; |
412 | cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; |
413 | |
414 | for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { |
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
419 | |
420 | cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; |
421 | } |
422 | } |
423 | |
424 | static void cobalt_vars_init(struct cobalt_vars *vars) |
425 | { |
426 | memset(vars, 0, sizeof(*vars)); |
427 | |
428 | if (!cobalt_rec_inv_sqrt_cache[0]) { |
429 | cobalt_cache_init(); |
430 | cobalt_rec_inv_sqrt_cache[0] = ~0; |
431 | } |
432 | } |
433 | |
434 | /* CoDel control_law is t + interval/sqrt(count) |
435 | * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid |
436 | * both sqrt() and divide operation. |
437 | */ |
438 | static ktime_t cobalt_control(ktime_t t, |
439 | u64 interval, |
440 | u32 rec_inv_sqrt) |
441 | { |
442 | return ktime_add_ns(t, reciprocal_scale(interval, |
443 | rec_inv_sqrt)); |
444 | } |
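
/* Example (illustrative): with interval = 100 ms and count = 4,
 * rec_inv_sqrt is ~0x80000000 (0.5 in Q0.32), so the next signal is
 * scheduled ~50 ms after 't'; as count grows the spacing shrinks as
 * interval/sqrt(count), exactly as in standard CoDel.
 */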
445 | |
446 | /* Call this when a packet had to be dropped due to queue overflow. Returns |
447 | * true if the BLUE state was quiescent before but active after this call. |
448 | */ |
449 | static bool cobalt_queue_full(struct cobalt_vars *vars, |
450 | struct cobalt_params *p, |
451 | ktime_t now) |
452 | { |
453 | bool up = false; |
454 | |
455 | if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { |
456 | up = !vars->p_drop; |
457 | vars->p_drop += p->p_inc; |
458 | if (vars->p_drop < p->p_inc) |
459 | vars->p_drop = ~0; |
460 | vars->blue_timer = now; |
461 | } |
462 | vars->dropping = true; |
463 | vars->drop_next = now; |
464 | if (!vars->count) |
465 | vars->count = 1; |
466 | |
467 | return up; |
468 | } |
469 | |
470 | /* Call this when the queue was serviced but turned out to be empty. Returns |
471 | * true if the BLUE state was active before but quiescent after this call. |
472 | */ |
473 | static bool cobalt_queue_empty(struct cobalt_vars *vars, |
474 | struct cobalt_params *p, |
475 | ktime_t now) |
476 | { |
477 | bool down = false; |
478 | |
479 | if (vars->p_drop && |
480 | ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { |
481 | if (vars->p_drop < p->p_dec) |
482 | vars->p_drop = 0; |
483 | else |
484 | vars->p_drop -= p->p_dec; |
485 | vars->blue_timer = now; |
486 | down = !vars->p_drop; |
487 | } |
488 | vars->dropping = false; |
489 | |
490 | if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { |
491 | vars->count--; |
492 | cobalt_invsqrt(vars); |
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
496 | } |
497 | |
498 | return down; |
499 | } |
500 | |
501 | /* Call this with a freshly dequeued packet for possible congestion marking. |
502 | * Returns true as an instruction to drop the packet, false for delivery. |
503 | */ |
504 | static bool cobalt_should_drop(struct cobalt_vars *vars, |
505 | struct cobalt_params *p, |
506 | ktime_t now, |
507 | struct sk_buff *skb, |
508 | u32 bulk_flows) |
509 | { |
510 | bool next_due, over_target, drop = false; |
511 | ktime_t schedule; |
512 | u64 sojourn; |
513 | |
514 | /* The 'schedule' variable records, in its sign, whether 'now' is before or |
515 | * after 'drop_next'. This allows 'drop_next' to be updated before the next |
516 | * scheduling decision is actually branched, without destroying that |
517 | * information. Similarly, the first 'schedule' value calculated is preserved |
518 | * in the boolean 'next_due'. |
519 | * |
520 | * As for 'drop_next', we take advantage of the fact that 'interval' is both |
521 | * the delay between first exceeding 'target' and the first signalling event, |
522 | * *and* the scaling factor for the signalling frequency. It's therefore very |
523 | * natural to use a single mechanism for both purposes, and eliminates a |
524 | * significant amount of reference Codel's spaghetti code. To help with this, |
525 | * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close |
526 | * as possible to 1.0 in fixed-point. |
527 | */ |
528 | |
529 | sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); |
530 | schedule = ktime_sub(now, vars->drop_next); |
531 | over_target = sojourn > p->target && |
532 | sojourn > p->mtu_time * bulk_flows * 2 && |
533 | sojourn > p->mtu_time * 4; |
	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535 | |
536 | vars->ecn_marked = false; |
537 | |
538 | if (over_target) { |
539 | if (!vars->dropping) { |
540 | vars->dropping = true; |
			vars->drop_next = cobalt_control(now,
							 p->interval,
							 vars->rec_inv_sqrt);
544 | } |
545 | if (!vars->count) |
546 | vars->count = 1; |
547 | } else if (vars->dropping) { |
548 | vars->dropping = false; |
549 | } |
550 | |
551 | if (next_due && vars->dropping) { |
552 | /* Use ECN mark if possible, otherwise drop */ |
553 | drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); |
554 | |
555 | vars->count++; |
556 | if (!vars->count) |
557 | vars->count--; |
558 | cobalt_invsqrt(vars); |
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
		schedule = ktime_sub(now, vars->drop_next);
	} else {
		while (next_due) {
			vars->count--;
			cobalt_invsqrt(vars);
			vars->drop_next = cobalt_control(vars->drop_next,
							 p->interval,
							 vars->rec_inv_sqrt);
			schedule = ktime_sub(now, vars->drop_next);
			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572 | } |
573 | } |
574 | |
575 | /* Simple BLUE implementation. Lack of ECN is deliberate. */ |
576 | if (vars->p_drop) |
577 | drop |= (get_random_u32() < vars->p_drop); |
578 | |
579 | /* Overload the drop_next field as an activity timeout */ |
580 | if (!vars->count) |
581 | vars->drop_next = ktime_add_ns(now, p->interval); |
	else if (ktime_to_ns(schedule) > 0 && !drop)
583 | vars->drop_next = now; |
584 | |
585 | return drop; |
586 | } |
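
/* Rough numbers for the over_target test above (illustrative only): with
 * a 5 ms target and a 1500-byte MTU, mtu_time is ~12 us at 1 Gbit/s, so
 * the target term dominates; at 1 Mbit/s mtu_time is ~12 ms, so the
 * 4 * mtu_time floor (~48 ms) takes over and avoids signalling on queues
 * that simply cannot drain any faster.
 */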
587 | |
588 | static bool cake_update_flowkeys(struct flow_keys *keys, |
589 | const struct sk_buff *skb) |
590 | { |
591 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
592 | struct nf_conntrack_tuple tuple = {}; |
593 | bool rev = !skb->_nfct, upd = false; |
594 | __be32 ip; |
595 | |
	if (skb_protocol(skb, true) != htons(ETH_P_IP))
		return false;

	if (!nf_ct_get_tuple_skb(&tuple, skb))
600 | return false; |
601 | |
602 | ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; |
603 | if (ip != keys->addrs.v4addrs.src) { |
604 | keys->addrs.v4addrs.src = ip; |
605 | upd = true; |
606 | } |
607 | ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; |
608 | if (ip != keys->addrs.v4addrs.dst) { |
609 | keys->addrs.v4addrs.dst = ip; |
610 | upd = true; |
611 | } |
612 | |
613 | if (keys->ports.ports) { |
614 | __be16 port; |
615 | |
616 | port = rev ? tuple.dst.u.all : tuple.src.u.all; |
617 | if (port != keys->ports.src) { |
618 | keys->ports.src = port; |
619 | upd = true; |
620 | } |
621 | port = rev ? tuple.src.u.all : tuple.dst.u.all; |
622 | if (port != keys->ports.dst) { |
			keys->ports.dst = port;
624 | upd = true; |
625 | } |
626 | } |
627 | return upd; |
628 | #else |
629 | return false; |
630 | #endif |
631 | } |
632 | |
633 | /* Cake has several subtle multiple bit settings. In these cases you |
634 | * would be matching triple isolate mode as well. |
635 | */ |
636 | |
637 | static bool cake_dsrc(int flow_mode) |
638 | { |
639 | return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; |
640 | } |
641 | |
642 | static bool cake_ddst(int flow_mode) |
643 | { |
644 | return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; |
645 | } |
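
/* Example of the "multiple bit settings" mentioned above (flag values
 * taken from the CAKE_FLOW_* enum in the uapi pkt_sched.h): DUAL_SRC and
 * DUAL_DST are each a strict subset of the bits in TRIPLE, so a
 * triple-isolate configuration satisfies both cake_dsrc() and
 * cake_ddst(), while plain dual-srchost mode satisfies only cake_dsrc().
 */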
646 | |
647 | static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, |
648 | int flow_mode, u16 flow_override, u16 host_override) |
649 | { |
650 | bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); |
651 | bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); |
652 | bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); |
653 | u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; |
654 | u16 reduced_hash, srchost_idx, dsthost_idx; |
655 | struct flow_keys keys, host_keys; |
656 | bool use_skbhash = skb->l4_hash; |
657 | |
658 | if (unlikely(flow_mode == CAKE_FLOW_NONE)) |
659 | return 0; |
660 | |
661 | /* If both overrides are set, or we can use the SKB hash and nat mode is |
662 | * disabled, we can skip packet dissection entirely. If nat mode is |
663 | * enabled there's another check below after doing the conntrack lookup. |
664 | */ |
665 | if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) |
666 | goto skip_hash; |
667 | |
	skb_flow_dissect_flow_keys(skb, &keys,
				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	/* Don't use the SKB hash if we change the lookup keys from conntrack */
	if (nat_enabled && cake_update_flowkeys(&keys, skb))
673 | use_skbhash = false; |
674 | |
675 | /* If we can still use the SKB hash and don't need the host hash, we can |
676 | * skip the rest of the hashing procedure |
677 | */ |
678 | if (use_skbhash && !hash_hosts) |
679 | goto skip_hash; |
680 | |
681 | /* flow_hash_from_keys() sorts the addresses by value, so we have |
682 | * to preserve their order in a separate data structure to treat |
683 | * src and dst host addresses as independently selectable. |
684 | */ |
685 | host_keys = keys; |
686 | host_keys.ports.ports = 0; |
687 | host_keys.basic.ip_proto = 0; |
688 | host_keys.keyid.keyid = 0; |
689 | host_keys.tags.flow_label = 0; |
690 | |
691 | switch (host_keys.control.addr_type) { |
692 | case FLOW_DISSECTOR_KEY_IPV4_ADDRS: |
693 | host_keys.addrs.v4addrs.src = 0; |
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		host_keys.addrs.v4addrs.dst = 0;
		srchost_hash = flow_hash_from_keys(&host_keys);
698 | break; |
699 | |
700 | case FLOW_DISSECTOR_KEY_IPV6_ADDRS: |
701 | memset(&host_keys.addrs.v6addrs.src, 0, |
702 | sizeof(host_keys.addrs.v6addrs.src)); |
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		memset(&host_keys.addrs.v6addrs.dst, 0,
		       sizeof(host_keys.addrs.v6addrs.dst));
		srchost_hash = flow_hash_from_keys(&host_keys);
708 | break; |
709 | |
710 | default: |
711 | dsthost_hash = 0; |
712 | srchost_hash = 0; |
713 | } |
714 | |
715 | /* This *must* be after the above switch, since as a |
716 | * side-effect it sorts the src and dst addresses. |
717 | */ |
718 | if (hash_flows && !use_skbhash) |
		flow_hash = flow_hash_from_keys(&keys);
720 | |
721 | skip_hash: |
722 | if (flow_override) |
723 | flow_hash = flow_override - 1; |
724 | else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) |
725 | flow_hash = skb->hash; |
726 | if (host_override) { |
727 | dsthost_hash = host_override - 1; |
728 | srchost_hash = host_override - 1; |
729 | } |
730 | |
731 | if (!(flow_mode & CAKE_FLOW_FLOWS)) { |
732 | if (flow_mode & CAKE_FLOW_SRC_IP) |
733 | flow_hash ^= srchost_hash; |
734 | |
735 | if (flow_mode & CAKE_FLOW_DST_IP) |
736 | flow_hash ^= dsthost_hash; |
737 | } |
738 | |
739 | reduced_hash = flow_hash % CAKE_QUEUES; |
740 | |
741 | /* set-associative hashing */ |
742 | /* fast path if no hash collision (direct lookup succeeds) */ |
743 | if (likely(q->tags[reduced_hash] == flow_hash && |
744 | q->flows[reduced_hash].set)) { |
745 | q->way_directs++; |
746 | } else { |
747 | u32 inner_hash = reduced_hash % CAKE_SET_WAYS; |
748 | u32 outer_hash = reduced_hash - inner_hash; |
749 | bool allocate_src = false; |
750 | bool allocate_dst = false; |
751 | u32 i, k; |
752 | |
753 | /* check if any active queue in the set is reserved for |
754 | * this flow. |
755 | */ |
756 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; |
757 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
758 | if (q->tags[outer_hash + k] == flow_hash) { |
759 | if (i) |
760 | q->way_hits++; |
761 | |
762 | if (!q->flows[outer_hash + k].set) { |
763 | /* need to increment host refcnts */ |
764 | allocate_src = cake_dsrc(flow_mode); |
765 | allocate_dst = cake_ddst(flow_mode); |
766 | } |
767 | |
768 | goto found; |
769 | } |
770 | } |
771 | |
772 | /* no queue is reserved for this flow, look for an |
773 | * empty one. |
774 | */ |
775 | for (i = 0; i < CAKE_SET_WAYS; |
776 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
777 | if (!q->flows[outer_hash + k].set) { |
778 | q->way_misses++; |
779 | allocate_src = cake_dsrc(flow_mode); |
780 | allocate_dst = cake_ddst(flow_mode); |
781 | goto found; |
782 | } |
783 | } |
784 | |
785 | /* With no empty queues, default to the original |
786 | * queue, accept the collision, update the host tags. |
787 | */ |
788 | q->way_collisions++; |
789 | if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { |
790 | q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; |
791 | q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; |
792 | } |
793 | allocate_src = cake_dsrc(flow_mode); |
794 | allocate_dst = cake_ddst(flow_mode); |
795 | found: |
796 | /* reserve queue for future packets in same flow */ |
797 | reduced_hash = outer_hash + k; |
798 | q->tags[reduced_hash] = flow_hash; |
799 | |
800 | if (allocate_src) { |
801 | srchost_idx = srchost_hash % CAKE_QUEUES; |
802 | inner_hash = srchost_idx % CAKE_SET_WAYS; |
803 | outer_hash = srchost_idx - inner_hash; |
804 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; |
805 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
806 | if (q->hosts[outer_hash + k].srchost_tag == |
807 | srchost_hash) |
808 | goto found_src; |
809 | } |
810 | for (i = 0; i < CAKE_SET_WAYS; |
811 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
812 | if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) |
813 | break; |
814 | } |
815 | q->hosts[outer_hash + k].srchost_tag = srchost_hash; |
816 | found_src: |
817 | srchost_idx = outer_hash + k; |
818 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) |
819 | q->hosts[srchost_idx].srchost_bulk_flow_count++; |
820 | q->flows[reduced_hash].srchost = srchost_idx; |
821 | } |
822 | |
823 | if (allocate_dst) { |
824 | dsthost_idx = dsthost_hash % CAKE_QUEUES; |
825 | inner_hash = dsthost_idx % CAKE_SET_WAYS; |
826 | outer_hash = dsthost_idx - inner_hash; |
827 | for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; |
828 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
829 | if (q->hosts[outer_hash + k].dsthost_tag == |
830 | dsthost_hash) |
831 | goto found_dst; |
832 | } |
833 | for (i = 0; i < CAKE_SET_WAYS; |
834 | i++, k = (k + 1) % CAKE_SET_WAYS) { |
835 | if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) |
836 | break; |
837 | } |
838 | q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; |
839 | found_dst: |
840 | dsthost_idx = outer_hash + k; |
841 | if (q->flows[reduced_hash].set == CAKE_SET_BULK) |
842 | q->hosts[dsthost_idx].dsthost_bulk_flow_count++; |
843 | q->flows[reduced_hash].dsthost = dsthost_idx; |
844 | } |
845 | } |
846 | |
847 | return reduced_hash; |
848 | } |
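
/* Illustrative walk-through of the set-associative lookup above: with
 * CAKE_QUEUES == 1024 and CAKE_SET_WAYS == 8 there are 128 sets of eight
 * queues. A flow with reduced_hash == 300 starts at inner_hash = 300 % 8
 * = 4 inside the set covering queues 296..303 (outer_hash == 296), and
 * wraps around within those eight entries before accepting a collision.
 */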
849 | |
850 | /* helper functions : might be changed when/if skb use a standard list_head */ |
851 | /* remove one skb from head of slot queue */ |
852 | |
853 | static struct sk_buff *dequeue_head(struct cake_flow *flow) |
854 | { |
855 | struct sk_buff *skb = flow->head; |
856 | |
857 | if (skb) { |
858 | flow->head = skb->next; |
859 | skb_mark_not_on_list(skb); |
860 | } |
861 | |
862 | return skb; |
863 | } |
864 | |
865 | /* add skb to flow queue (tail add) */ |
866 | |
867 | static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) |
868 | { |
869 | if (!flow->head) |
870 | flow->head = skb; |
871 | else |
872 | flow->tail->next = skb; |
873 | flow->tail = skb; |
874 | skb->next = NULL; |
875 | } |
876 | |
877 | static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, |
878 | struct ipv6hdr *buf) |
879 | { |
880 | unsigned int offset = skb_network_offset(skb); |
881 | struct iphdr *iph; |
882 | |
	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
884 | |
885 | if (!iph) |
886 | return NULL; |
887 | |
888 | if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) |
		return skb_header_pointer(skb, offset + iph->ihl * 4,
					  sizeof(struct ipv6hdr), buf);

	else if (iph->version == 4)
		return iph;

	else if (iph->version == 6)
		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
					  buf);
898 | |
899 | return NULL; |
900 | } |
901 | |
902 | static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, |
903 | void *buf, unsigned int bufsize) |
904 | { |
905 | unsigned int offset = skb_network_offset(skb); |
906 | const struct ipv6hdr *ipv6h; |
907 | const struct tcphdr *tcph; |
908 | const struct iphdr *iph; |
909 | struct ipv6hdr _ipv6h; |
910 | struct tcphdr _tcph; |
911 | |
	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
913 | |
914 | if (!ipv6h) |
915 | return NULL; |
916 | |
917 | if (ipv6h->version == 4) { |
918 | iph = (struct iphdr *)ipv6h; |
919 | offset += iph->ihl * 4; |
920 | |
921 | /* special-case 6in4 tunnelling, as that is a common way to get |
922 | * v6 connectivity in the home |
923 | */ |
924 | if (iph->protocol == IPPROTO_IPV6) { |
			ipv6h = skb_header_pointer(skb, offset,
						   sizeof(_ipv6h), &_ipv6h);
927 | |
928 | if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) |
929 | return NULL; |
930 | |
931 | offset += sizeof(struct ipv6hdr); |
932 | |
933 | } else if (iph->protocol != IPPROTO_TCP) { |
934 | return NULL; |
935 | } |
936 | |
937 | } else if (ipv6h->version == 6) { |
938 | if (ipv6h->nexthdr != IPPROTO_TCP) |
939 | return NULL; |
940 | |
941 | offset += sizeof(struct ipv6hdr); |
942 | } else { |
943 | return NULL; |
944 | } |
945 | |
	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
947 | if (!tcph || tcph->doff < 5) |
948 | return NULL; |
949 | |
	return skb_header_pointer(skb, offset,
				  min(__tcp_hdrlen(tcph), bufsize), buf);
952 | } |
953 | |
954 | static const void *cake_get_tcpopt(const struct tcphdr *tcph, |
955 | int code, int *oplen) |
956 | { |
957 | /* inspired by tcp_parse_options in tcp_input.c */ |
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
959 | const u8 *ptr = (const u8 *)(tcph + 1); |
960 | |
961 | while (length > 0) { |
962 | int opcode = *ptr++; |
963 | int opsize; |
964 | |
965 | if (opcode == TCPOPT_EOL) |
966 | break; |
967 | if (opcode == TCPOPT_NOP) { |
968 | length--; |
969 | continue; |
970 | } |
971 | if (length < 2) |
972 | break; |
973 | opsize = *ptr++; |
974 | if (opsize < 2 || opsize > length) |
975 | break; |
976 | |
977 | if (opcode == code) { |
978 | *oplen = opsize; |
979 | return ptr; |
980 | } |
981 | |
982 | ptr += opsize - 2; |
983 | length -= opsize; |
984 | } |
985 | |
986 | return NULL; |
987 | } |
988 | |
989 | /* Compare two SACK sequences. A sequence is considered greater if it SACKs more |
990 | * bytes than the other. In the case where both sequences ACKs bytes that the |
991 | * other doesn't, A is considered greater. DSACKs in A also makes A be |
992 | * considered greater. |
993 | * |
994 | * @return -1, 0 or 1 as normal compare functions |
995 | */ |
996 | static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, |
997 | const struct tcphdr *tcph_b) |
998 | { |
999 | const struct tcp_sack_block_wire *sack_a, *sack_b; |
1000 | u32 ack_seq_a = ntohl(tcph_a->ack_seq); |
1001 | u32 bytes_a = 0, bytes_b = 0; |
1002 | int oplen_a, oplen_b; |
1003 | bool first = true; |
1004 | |
	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1007 | |
1008 | /* pointers point to option contents */ |
1009 | oplen_a -= TCPOLEN_SACK_BASE; |
1010 | oplen_b -= TCPOLEN_SACK_BASE; |
1011 | |
1012 | if (sack_a && oplen_a >= sizeof(*sack_a) && |
1013 | (!sack_b || oplen_b < sizeof(*sack_b))) |
1014 | return -1; |
1015 | else if (sack_b && oplen_b >= sizeof(*sack_b) && |
1016 | (!sack_a || oplen_a < sizeof(*sack_a))) |
1017 | return 1; |
1018 | else if ((!sack_a || oplen_a < sizeof(*sack_a)) && |
1019 | (!sack_b || oplen_b < sizeof(*sack_b))) |
1020 | return 0; |
1021 | |
1022 | while (oplen_a >= sizeof(*sack_a)) { |
1023 | const struct tcp_sack_block_wire *sack_tmp = sack_b; |
		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1026 | int oplen_tmp = oplen_b; |
1027 | bool found = false; |
1028 | |
1029 | /* DSACK; always considered greater to prevent dropping */ |
		if (before(start_a, ack_seq_a))
1031 | return -1; |
1032 | |
1033 | bytes_a += end_a - start_a; |
1034 | |
1035 | while (oplen_tmp >= sizeof(*sack_tmp)) { |
			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1038 | |
1039 | /* first time through we count the total size */ |
1040 | if (first) |
1041 | bytes_b += end_b - start_b; |
1042 | |
			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1044 | found = true; |
1045 | if (!first) |
1046 | break; |
1047 | } |
1048 | oplen_tmp -= sizeof(*sack_tmp); |
1049 | sack_tmp++; |
1050 | } |
1051 | |
1052 | if (!found) |
1053 | return -1; |
1054 | |
1055 | oplen_a -= sizeof(*sack_a); |
1056 | sack_a++; |
1057 | first = false; |
1058 | } |
1059 | |
1060 | /* If we made it this far, all ranges SACKed by A are covered by B, so |
1061 | * either the SACKs are equal, or B SACKs more bytes. |
1062 | */ |
1063 | return bytes_b > bytes_a ? 1 : 0; |
1064 | } |
1065 | |
1066 | static void cake_tcph_get_tstamp(const struct tcphdr *tcph, |
1067 | u32 *tsval, u32 *tsecr) |
1068 | { |
1069 | const u8 *ptr; |
1070 | int opsize; |
1071 | |
	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1073 | |
1074 | if (ptr && opsize == TCPOLEN_TIMESTAMP) { |
		*tsval = get_unaligned_be32(ptr);
		*tsecr = get_unaligned_be32(ptr + 4);
1077 | } |
1078 | } |
1079 | |
1080 | static bool cake_tcph_may_drop(const struct tcphdr *tcph, |
1081 | u32 tstamp_new, u32 tsecr_new) |
1082 | { |
1083 | /* inspired by tcp_parse_options in tcp_input.c */ |
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1085 | const u8 *ptr = (const u8 *)(tcph + 1); |
1086 | u32 tstamp, tsecr; |
1087 | |
1088 | /* 3 reserved flags must be unset to avoid future breakage |
1089 | * ACK must be set |
1090 | * ECE/CWR are handled separately |
1091 | * All other flags URG/PSH/RST/SYN/FIN must be unset |
1092 | * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) |
1093 | * 0x00C00000 = CWR/ECE (handled separately) |
1094 | * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 |
1095 | */ |
1096 | if (((tcp_flag_word(tcph) & |
1097 | cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) |
1098 | return false; |
1099 | |
1100 | while (length > 0) { |
1101 | int opcode = *ptr++; |
1102 | int opsize; |
1103 | |
1104 | if (opcode == TCPOPT_EOL) |
1105 | break; |
1106 | if (opcode == TCPOPT_NOP) { |
1107 | length--; |
1108 | continue; |
1109 | } |
1110 | if (length < 2) |
1111 | break; |
1112 | opsize = *ptr++; |
1113 | if (opsize < 2 || opsize > length) |
1114 | break; |
1115 | |
1116 | switch (opcode) { |
1117 | case TCPOPT_MD5SIG: /* doesn't influence state */ |
1118 | break; |
1119 | |
1120 | case TCPOPT_SACK: /* stricter checking performed later */ |
1121 | if (opsize % 8 != 2) |
1122 | return false; |
1123 | break; |
1124 | |
1125 | case TCPOPT_TIMESTAMP: |
1126 | /* only drop timestamps lower than new */ |
1127 | if (opsize != TCPOLEN_TIMESTAMP) |
1128 | return false; |
			tstamp = get_unaligned_be32(ptr);
			tsecr = get_unaligned_be32(ptr + 4);
1131 | if (after(tstamp, tstamp_new) || |
1132 | after(tsecr, tsecr_new)) |
1133 | return false; |
1134 | break; |
1135 | |
1136 | case TCPOPT_MSS: /* these should only be set on SYN */ |
1137 | case TCPOPT_WINDOW: |
1138 | case TCPOPT_SACK_PERM: |
1139 | case TCPOPT_FASTOPEN: |
1140 | case TCPOPT_EXP: |
1141 | default: /* don't drop if any unknown options are present */ |
1142 | return false; |
1143 | } |
1144 | |
1145 | ptr += opsize - 2; |
1146 | length -= opsize; |
1147 | } |
1148 | |
1149 | return true; |
1150 | } |
1151 | |
1152 | static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, |
1153 | struct cake_flow *flow) |
1154 | { |
1155 | bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; |
1156 | struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; |
1157 | struct sk_buff *skb_check, *skb_prev = NULL; |
1158 | const struct ipv6hdr *ipv6h, *ipv6h_check; |
1159 | unsigned char _tcph[64], _tcph_check[64]; |
1160 | const struct tcphdr *tcph, *tcph_check; |
1161 | const struct iphdr *iph, *iph_check; |
1162 | struct ipv6hdr _iph, _iph_check; |
1163 | const struct sk_buff *skb; |
1164 | int seglen, num_found = 0; |
1165 | u32 tstamp = 0, tsecr = 0; |
1166 | __be32 elig_flags = 0; |
1167 | int sack_comp; |
1168 | |
1169 | /* no other possible ACKs to filter */ |
1170 | if (flow->head == flow->tail) |
1171 | return NULL; |
1172 | |
1173 | skb = flow->tail; |
	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
	iph = cake_get_iphdr(skb, &_iph);
	if (!tcph)
		return NULL;

	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1180 | |
1181 | /* the 'triggering' packet need only have the ACK flag set. |
1182 | * also check that SYN is not set, as there won't be any previous ACKs. |
1183 | */ |
1184 | if ((tcp_flag_word(tcph) & |
1185 | (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) |
1186 | return NULL; |
1187 | |
1188 | /* the 'triggering' ACK is at the tail of the queue, we have already |
1189 | * returned if it is the only packet in the flow. loop through the rest |
1190 | * of the queue looking for pure ACKs with the same 5-tuple as the |
1191 | * triggering one. |
1192 | */ |
1193 | for (skb_check = flow->head; |
1194 | skb_check && skb_check != skb; |
1195 | skb_prev = skb_check, skb_check = skb_check->next) { |
		iph_check = cake_get_iphdr(skb_check, &_iph_check);
		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
					     sizeof(_tcph_check));
1199 | |
1200 | /* only TCP packets with matching 5-tuple are eligible, and only |
1201 | * drop safe headers |
1202 | */ |
1203 | if (!tcph_check || iph->version != iph_check->version || |
1204 | tcph_check->source != tcph->source || |
1205 | tcph_check->dest != tcph->dest) |
1206 | continue; |
1207 | |
1208 | if (iph_check->version == 4) { |
1209 | if (iph_check->saddr != iph->saddr || |
1210 | iph_check->daddr != iph->daddr) |
1211 | continue; |
1212 | |
			seglen = iph_totlen(skb, iph_check) -
1214 | (4 * iph_check->ihl); |
1215 | } else if (iph_check->version == 6) { |
1216 | ipv6h = (struct ipv6hdr *)iph; |
1217 | ipv6h_check = (struct ipv6hdr *)iph_check; |
1218 | |
			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1221 | continue; |
1222 | |
1223 | seglen = ntohs(ipv6h_check->payload_len); |
1224 | } else { |
1225 | WARN_ON(1); /* shouldn't happen */ |
1226 | continue; |
1227 | } |
1228 | |
1229 | /* If the ECE/CWR flags changed from the previous eligible |
1230 | * packet in the same flow, we should no longer be dropping that |
1231 | * previous packet as this would lose information. |
1232 | */ |
1233 | if (elig_ack && (tcp_flag_word(tcph_check) & |
1234 | (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { |
1235 | elig_ack = NULL; |
1236 | elig_ack_prev = NULL; |
1237 | num_found--; |
1238 | } |
1239 | |
1240 | /* Check TCP options and flags, don't drop ACKs with segment |
1241 | * data, and don't drop ACKs with a higher cumulative ACK |
1242 | * counter than the triggering packet. Check ACK seqno here to |
1243 | * avoid parsing SACK options of packets we are going to exclude |
1244 | * anyway. |
1245 | */ |
		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1248 | after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) |
1249 | continue; |
1250 | |
1251 | /* Check SACK options. The triggering packet must SACK more data |
1252 | * than the ACK under consideration, or SACK the same range but |
1253 | * have a larger cumulative ACK counter. The latter is a |
1254 | * pathological case, but is contained in the following check |
1255 | * anyway, just to be safe. |
1256 | */ |
		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1258 | |
1259 | if (sack_comp < 0 || |
1260 | (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && |
1261 | sack_comp == 0)) |
1262 | continue; |
1263 | |
1264 | /* At this point we have found an eligible pure ACK to drop; if |
1265 | * we are in aggressive mode, we are done. Otherwise, keep |
1266 | * searching unless this is the second eligible ACK we |
1267 | * found. |
1268 | * |
1269 | * Since we want to drop ACK closest to the head of the queue, |
1270 | * save the first eligible ACK we find, even if we need to loop |
1271 | * again. |
1272 | */ |
1273 | if (!elig_ack) { |
1274 | elig_ack = skb_check; |
1275 | elig_ack_prev = skb_prev; |
1276 | elig_flags = (tcp_flag_word(tcph_check) |
1277 | & (TCP_FLAG_ECE | TCP_FLAG_CWR)); |
1278 | } |
1279 | |
1280 | if (num_found++ > 0) |
1281 | goto found; |
1282 | } |
1283 | |
	/* We made it through the queue without finding two eligible ACKs. If
1285 | * we found a single eligible ACK we can drop it in aggressive mode if |
1286 | * we can guarantee that this does not interfere with ECN flag |
1287 | * information. We ensure this by dropping it only if the enqueued |
1288 | * packet is consecutive with the eligible ACK, and their flags match. |
1289 | */ |
1290 | if (elig_ack && aggressive && elig_ack->next == skb && |
1291 | (elig_flags == (tcp_flag_word(tcph) & |
1292 | (TCP_FLAG_ECE | TCP_FLAG_CWR)))) |
1293 | goto found; |
1294 | |
1295 | return NULL; |
1296 | |
1297 | found: |
1298 | if (elig_ack_prev) |
1299 | elig_ack_prev->next = elig_ack->next; |
1300 | else |
1301 | flow->head = elig_ack->next; |
1302 | |
	skb_mark_not_on_list(elig_ack);
1304 | |
1305 | return elig_ack; |
1306 | } |
1307 | |
1308 | static u64 cake_ewma(u64 avg, u64 sample, u32 shift) |
1309 | { |
1310 | avg -= avg >> shift; |
1311 | avg += sample >> shift; |
1312 | return avg; |
1313 | } |
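
/* cake_ewma() is a plain shift-based EWMA: with shift == 8 a new sample
 * contributes 1/256th of its value. E.g. (integer arithmetic, purely
 * illustrative) avg = 1000, sample = 2280 gives 1000 - 3 + 8 = 1005.
 */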
1314 | |
1315 | static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) |
1316 | { |
1317 | if (q->rate_flags & CAKE_FLAG_OVERHEAD) |
1318 | len -= off; |
1319 | |
1320 | if (q->max_netlen < len) |
1321 | q->max_netlen = len; |
1322 | if (q->min_netlen > len) |
1323 | q->min_netlen = len; |
1324 | |
1325 | len += q->rate_overhead; |
1326 | |
1327 | if (len < q->rate_mpu) |
1328 | len = q->rate_mpu; |
1329 | |
1330 | if (q->atm_mode == CAKE_ATM_ATM) { |
1331 | len += 47; |
1332 | len /= 48; |
1333 | len *= 53; |
1334 | } else if (q->atm_mode == CAKE_ATM_PTM) { |
1335 | /* Add one byte per 64 bytes or part thereof. |
1336 | * This is conservative and easier to calculate than the |
1337 | * precise value. |
1338 | */ |
1339 | len += (len + 63) / 64; |
1340 | } |
1341 | |
1342 | if (q->max_adjlen < len) |
1343 | q->max_adjlen = len; |
1344 | if (q->min_adjlen > len) |
1345 | q->min_adjlen = len; |
1346 | |
1347 | return len; |
1348 | } |
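
/* Worked example of the framing compensation above (illustrative): a
 * 100-byte frame with no configured overhead fills ceil(100 / 48) = 3 ATM
 * cells, so ATM mode bills it as 3 * 53 = 159 bytes on the wire; PTM mode
 * bills the same frame as 100 + (100 + 63) / 64 = 102 bytes.
 */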
1349 | |
1350 | static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) |
1351 | { |
1352 | const struct skb_shared_info *shinfo = skb_shinfo(skb); |
1353 | unsigned int hdr_len, last_len = 0; |
1354 | u32 off = skb_network_offset(skb); |
1355 | u32 len = qdisc_pkt_len(skb); |
1356 | u16 segs = 1; |
1357 | |
	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1359 | |
1360 | if (!shinfo->gso_size) |
1361 | return cake_calc_overhead(q, len, off); |
1362 | |
1363 | /* borrowed from qdisc_pkt_len_init() */ |
1364 | hdr_len = skb_transport_offset(skb); |
1365 | |
1366 | /* + transport layer */ |
1367 | if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | |
1368 | SKB_GSO_TCPV6))) { |
1369 | const struct tcphdr *th; |
1370 | struct tcphdr _tcphdr; |
1371 | |
		th = skb_header_pointer(skb, hdr_len,
					sizeof(_tcphdr), &_tcphdr);
1374 | if (likely(th)) |
1375 | hdr_len += __tcp_hdrlen(th); |
1376 | } else { |
1377 | struct udphdr _udphdr; |
1378 | |
		if (skb_header_pointer(skb, hdr_len,
				       sizeof(_udphdr), &_udphdr))
1381 | hdr_len += sizeof(struct udphdr); |
1382 | } |
1383 | |
1384 | if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) |
1385 | segs = DIV_ROUND_UP(skb->len - hdr_len, |
1386 | shinfo->gso_size); |
1387 | else |
1388 | segs = shinfo->gso_segs; |
1389 | |
1390 | len = shinfo->gso_size + hdr_len; |
1391 | last_len = skb->len - shinfo->gso_size * (segs - 1); |
1392 | |
	return (cake_calc_overhead(q, len, off) * (segs - 1) +
		cake_calc_overhead(q, last_len, off));
1395 | } |
1396 | |
1397 | static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) |
1398 | { |
1399 | struct cake_heap_entry ii = q->overflow_heap[i]; |
1400 | struct cake_heap_entry jj = q->overflow_heap[j]; |
1401 | |
1402 | q->overflow_heap[i] = jj; |
1403 | q->overflow_heap[j] = ii; |
1404 | |
1405 | q->tins[ii.t].overflow_idx[ii.b] = j; |
1406 | q->tins[jj.t].overflow_idx[jj.b] = i; |
1407 | } |
1408 | |
1409 | static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) |
1410 | { |
1411 | struct cake_heap_entry ii = q->overflow_heap[i]; |
1412 | |
1413 | return q->tins[ii.t].backlogs[ii.b]; |
1414 | } |
1415 | |
1416 | static void cake_heapify(struct cake_sched_data *q, u16 i) |
1417 | { |
1418 | static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; |
1419 | u32 mb = cake_heap_get_backlog(q, i); |
1420 | u32 m = i; |
1421 | |
1422 | while (m < a) { |
1423 | u32 l = m + m + 1; |
1424 | u32 r = l + 1; |
1425 | |
1426 | if (l < a) { |
			u32 lb = cake_heap_get_backlog(q, l);
1428 | |
1429 | if (lb > mb) { |
1430 | m = l; |
1431 | mb = lb; |
1432 | } |
1433 | } |
1434 | |
1435 | if (r < a) { |
			u32 rb = cake_heap_get_backlog(q, r);
1437 | |
1438 | if (rb > mb) { |
1439 | m = r; |
1440 | mb = rb; |
1441 | } |
1442 | } |
1443 | |
1444 | if (m != i) { |
			cake_heap_swap(q, i, m);
1446 | i = m; |
1447 | } else { |
1448 | break; |
1449 | } |
1450 | } |
1451 | } |
1452 | |
1453 | static void cake_heapify_up(struct cake_sched_data *q, u16 i) |
1454 | { |
1455 | while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { |
1456 | u16 p = (i - 1) >> 1; |
1457 | u32 ib = cake_heap_get_backlog(q, i); |
		u32 pb = cake_heap_get_backlog(q, p);

		if (ib > pb) {
			cake_heap_swap(q, i, p);
1462 | i = p; |
1463 | } else { |
1464 | break; |
1465 | } |
1466 | } |
1467 | } |
1468 | |
1469 | static int cake_advance_shaper(struct cake_sched_data *q, |
1470 | struct cake_tin_data *b, |
1471 | struct sk_buff *skb, |
1472 | ktime_t now, bool drop) |
1473 | { |
1474 | u32 len = get_cobalt_cb(skb)->adjusted_len; |
1475 | |
1476 | /* charge packet bandwidth to this tin |
1477 | * and to the global shaper. |
1478 | */ |
1479 | if (q->rate_ns) { |
1480 | u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; |
1481 | u64 global_dur = (len * q->rate_ns) >> q->rate_shft; |
1482 | u64 failsafe_dur = global_dur + (global_dur >> 1); |
1483 | |
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = ktime_add_ns(b->time_next_packet,
							   tin_dur);

		else if (ktime_before(b->time_next_packet,
1489 | ktime_add_ns(now, tin_dur))) |
1490 | b->time_next_packet = ktime_add_ns(now, tin_dur); |
1491 | |
1492 | q->time_next_packet = ktime_add_ns(q->time_next_packet, |
1493 | global_dur); |
1494 | if (!drop) |
1495 | q->failsafe_next_packet = \ |
1496 | ktime_add_ns(q->failsafe_next_packet, |
1497 | failsafe_dur); |
1498 | } |
1499 | return len; |
1500 | } |
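
/* Shaper arithmetic as a rough example (the actual rate_ns/rate_shft
 * values are derived from the configured rate elsewhere): at 100 Mbit/s
 * one byte takes 80 ns of link time, so a 1500-byte packet advances
 * time_next_packet by ~120 us and failsafe_next_packet by 1.5x that,
 * ~180 us.
 */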
1501 | |
1502 | static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) |
1503 | { |
1504 | struct cake_sched_data *q = qdisc_priv(sch); |
1505 | ktime_t now = ktime_get(); |
1506 | u32 idx = 0, tin = 0, len; |
1507 | struct cake_heap_entry qq; |
1508 | struct cake_tin_data *b; |
1509 | struct cake_flow *flow; |
1510 | struct sk_buff *skb; |
1511 | |
1512 | if (!q->overflow_timeout) { |
1513 | int i; |
1514 | /* Build fresh max-heap */ |
1515 | for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) |
1516 | cake_heapify(q, i); |
1517 | } |
1518 | q->overflow_timeout = 65535; |
1519 | |
1520 | /* select longest queue for pruning */ |
1521 | qq = q->overflow_heap[0]; |
1522 | tin = qq.t; |
1523 | idx = qq.b; |
1524 | |
1525 | b = &q->tins[tin]; |
1526 | flow = &b->flows[idx]; |
1527 | skb = dequeue_head(flow); |
1528 | if (unlikely(!skb)) { |
1529 | /* heap has gone wrong, rebuild it next time */ |
1530 | q->overflow_timeout = 0; |
1531 | return idx + (tin << 16); |
1532 | } |
1533 | |
	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1535 | b->unresponsive_flow_count++; |
1536 | |
1537 | len = qdisc_pkt_len(skb); |
1538 | q->buffer_used -= skb->truesize; |
1539 | b->backlogs[idx] -= len; |
1540 | b->tin_backlog -= len; |
1541 | sch->qstats.backlog -= len; |
	qdisc_tree_reduce_backlog(sch, 1, len);
1543 | |
1544 | flow->dropped++; |
1545 | b->tin_dropped++; |
1546 | sch->qstats.drops++; |
1547 | |
1548 | if (q->rate_flags & CAKE_FLAG_INGRESS) |
		cake_advance_shaper(q, b, skb, now, true);
1550 | |
1551 | __qdisc_drop(skb, to_free); |
1552 | sch->q.qlen--; |
1553 | |
	cake_heapify(q, 0);
1555 | |
1556 | return idx + (tin << 16); |
1557 | } |
1558 | |
1559 | static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) |
1560 | { |
1561 | const int offset = skb_network_offset(skb); |
1562 | u16 *buf, buf_; |
1563 | u8 dscp; |
1564 | |
	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* ToS is in the second byte of iphdr */
		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1573 | |
1574 | if (wash && dscp) { |
1575 | const int wlen = offset + sizeof(struct iphdr); |
1576 | |
			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1582 | } |
1583 | |
1584 | return dscp; |
1585 | |
1586 | case htons(ETH_P_IPV6): |
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1588 | if (unlikely(!buf)) |
1589 | return 0; |
1590 | |
1591 | /* Traffic class is in the first and second bytes of ipv6hdr */ |
		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1593 | |
1594 | if (wash && dscp) { |
1595 | const int wlen = offset + sizeof(struct ipv6hdr); |
1596 | |
			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1602 | } |
1603 | |
1604 | return dscp; |
1605 | |
1606 | case htons(ETH_P_ARP): |
1607 | return 0x38; /* CS7 - Net Control */ |
1608 | |
1609 | default: |
1610 | /* If there is no Diffserv field, treat as best-effort */ |
1611 | return 0; |
1612 | } |
1613 | } |
1614 | |
1615 | static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, |
1616 | struct sk_buff *skb) |
1617 | { |
1618 | struct cake_sched_data *q = qdisc_priv(sch); |
1619 | u32 tin, mark; |
1620 | bool wash; |
1621 | u8 dscp; |
1622 | |
1623 | /* Tin selection: Default to diffserv-based selection, allow overriding |
1624 | * using firewall marks or skb->priority. Call DSCP parsing early if |
1625 | * wash is enabled, otherwise defer to below to skip unneeded parsing. |
1626 | */ |
1627 | mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; |
1628 | wash = !!(q->rate_flags & CAKE_FLAG_WASH); |
1629 | if (wash) |
1630 | dscp = cake_handle_diffserv(skb, wash); |
1631 | |
1632 | if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) |
1633 | tin = 0; |
1634 | |
1635 | else if (mark && mark <= q->tin_cnt) |
1636 | tin = q->tin_order[mark - 1]; |
1637 | |
1638 | else if (TC_H_MAJ(skb->priority) == sch->handle && |
1639 | TC_H_MIN(skb->priority) > 0 && |
1640 | TC_H_MIN(skb->priority) <= q->tin_cnt) |
1641 | tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; |
1642 | |
1643 | else { |
1644 | if (!wash) |
1645 | dscp = cake_handle_diffserv(skb, wash); |
1646 | tin = q->tin_index[dscp]; |
1647 | |
1648 | if (unlikely(tin >= q->tin_cnt)) |
1649 | tin = 0; |
1650 | } |
1651 | |
1652 | return &q->tins[tin]; |
1653 | } |
1654 | |
1655 | static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, |
1656 | struct sk_buff *skb, int flow_mode, int *qerr) |
1657 | { |
1658 | struct cake_sched_data *q = qdisc_priv(sch); |
1659 | struct tcf_proto *filter; |
1660 | struct tcf_result res; |
1661 | u16 flow = 0, host = 0; |
1662 | int result; |
1663 | |
1664 | filter = rcu_dereference_bh(q->filter_list); |
1665 | if (!filter) |
1666 | goto hash; |
1667 | |
1668 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; |
	result = tcf_classify(skb, NULL, filter, &res, false);
1670 | |
1671 | if (result >= 0) { |
1672 | #ifdef CONFIG_NET_CLS_ACT |
1673 | switch (result) { |
1674 | case TC_ACT_STOLEN: |
1675 | case TC_ACT_QUEUED: |
1676 | case TC_ACT_TRAP: |
1677 | *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; |
1678 | fallthrough; |
1679 | case TC_ACT_SHOT: |
1680 | return 0; |
1681 | } |
1682 | #endif |
1683 | if (TC_H_MIN(res.classid) <= CAKE_QUEUES) |
1684 | flow = TC_H_MIN(res.classid); |
1685 | if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) |
1686 | host = TC_H_MAJ(res.classid) >> 16; |
1687 | } |
1688 | hash: |
1689 | *t = cake_select_tin(sch, skb); |
	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1691 | } |
1692 | |
1693 | static void cake_reconfigure(struct Qdisc *sch); |
1694 | |
1695 | static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
1696 | struct sk_buff **to_free) |
1697 | { |
1698 | struct cake_sched_data *q = qdisc_priv(sch); |
1699 | int len = qdisc_pkt_len(skb); |
1700 | int ret; |
1701 | struct sk_buff *ack = NULL; |
1702 | ktime_t now = ktime_get(); |
1703 | struct cake_tin_data *b; |
1704 | struct cake_flow *flow; |
1705 | u32 idx; |
1706 | |
1707 | /* choose flow to insert into */ |
idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1709 | if (idx == 0) { |
1710 | if (ret & __NET_XMIT_BYPASS) |
1711 | qdisc_qstats_drop(sch); |
1712 | __qdisc_drop(skb, to_free); |
1713 | return ret; |
1714 | } |
1715 | idx--; |
1716 | flow = &b->flows[idx]; |
1717 | |
1718 | /* ensure shaper state isn't stale */ |
1719 | if (!b->tin_backlog) { |
if (ktime_before(b->time_next_packet, now))
1721 | b->time_next_packet = now; |
1722 | |
1723 | if (!sch->q.qlen) { |
if (ktime_before(q->time_next_packet, now)) {
1725 | q->failsafe_next_packet = now; |
1726 | q->time_next_packet = now; |
} else if (ktime_after(q->time_next_packet, now) &&
	   ktime_after(q->failsafe_next_packet, now)) {
1729 | u64 next = \ |
1730 | min(ktime_to_ns(q->time_next_packet), |
1731 | ktime_to_ns( |
1732 | q->failsafe_next_packet)); |
1733 | sch->qstats.overlimits++; |
qdisc_watchdog_schedule_ns(&q->watchdog, next);
1735 | } |
1736 | } |
1737 | } |
1738 | |
1739 | if (unlikely(len > b->max_skblen)) |
1740 | b->max_skblen = len; |
1741 | |
1742 | if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { |
1743 | struct sk_buff *segs, *nskb; |
1744 | netdev_features_t features = netif_skb_features(skb); |
1745 | unsigned int slen = 0, numsegs = 0; |
1746 | |
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
1749 | return qdisc_drop(skb, sch, to_free); |
1750 | |
1751 | skb_list_walk_safe(segs, segs, nskb) { |
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
cobalt_set_enqueue_time(segs, now);
get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
						  segs);
flow_queue_add(flow, segs);
1758 | |
1759 | sch->q.qlen++; |
1760 | numsegs++; |
1761 | slen += segs->len; |
1762 | q->buffer_used += segs->truesize; |
1763 | b->packets++; |
1764 | } |
1765 | |
1766 | /* stats */ |
1767 | b->bytes += slen; |
1768 | b->backlogs[idx] += slen; |
1769 | b->tin_backlog += slen; |
1770 | sch->qstats.backlog += slen; |
1771 | q->avg_window_bytes += slen; |
1772 | |
qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1774 | consume_skb(skb); |
1775 | } else { |
1776 | /* not splitting */ |
1777 | cobalt_set_enqueue_time(skb, now); |
1778 | get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); |
1779 | flow_queue_add(flow, skb); |
1780 | |
1781 | if (q->ack_filter) |
1782 | ack = cake_ack_filter(q, flow); |
1783 | |
1784 | if (ack) { |
1785 | b->ack_drops++; |
1786 | sch->qstats.drops++; |
b->bytes += qdisc_pkt_len(ack);
len -= qdisc_pkt_len(ack);
q->buffer_used += skb->truesize - ack->truesize;
if (q->rate_flags & CAKE_FLAG_INGRESS)
cake_advance_shaper(q, b, ack, now, true);

qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
consume_skb(ack);
1795 | } else { |
1796 | sch->q.qlen++; |
1797 | q->buffer_used += skb->truesize; |
1798 | } |
1799 | |
1800 | /* stats */ |
1801 | b->packets++; |
1802 | b->bytes += len; |
1803 | b->backlogs[idx] += len; |
1804 | b->tin_backlog += len; |
1805 | sch->qstats.backlog += len; |
1806 | q->avg_window_bytes += len; |
1807 | } |
1808 | |
1809 | if (q->overflow_timeout) |
cake_heapify_up(q, b->overflow_idx[idx]);
1811 | |
1812 | /* incoming bandwidth capacity estimate */ |
1813 | if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { |
1814 | u64 packet_interval = \ |
1815 | ktime_to_ns(ktime_sub(now, q->last_packet_time)); |
1816 | |
1817 | if (packet_interval > NSEC_PER_SEC) |
1818 | packet_interval = NSEC_PER_SEC; |
1819 | |
1820 | /* filter out short-term bursts, eg. wifi aggregation */ |
q->avg_packet_interval = \
cake_ewma(q->avg_packet_interval,
	  packet_interval,
	  (packet_interval > q->avg_packet_interval ?
	   2 : 8));
1826 | |
1827 | q->last_packet_time = now; |
1828 | |
1829 | if (packet_interval > q->avg_packet_interval) { |
1830 | u64 window_interval = \ |
1831 | ktime_to_ns(ktime_sub(now, |
1832 | q->avg_window_begin)); |
1833 | u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; |
1834 | |
b = div64_u64(b, window_interval);
q->avg_peak_bandwidth =
	cake_ewma(q->avg_peak_bandwidth, b,
		  b > q->avg_peak_bandwidth ? 2 : 8);
1839 | q->avg_window_bytes = 0; |
1840 | q->avg_window_begin = now; |
1841 | |
if (ktime_after(now,
		ktime_add_ms(q->last_reconfig_time,
			     250))) {
1845 | q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; |
1846 | cake_reconfigure(sch); |
1847 | } |
1848 | } |
1849 | } else { |
1850 | q->avg_window_bytes = 0; |
1851 | q->last_packet_time = now; |
1852 | } |
1853 | |
1854 | /* flowchain */ |
1855 | if (!flow->set || flow->set == CAKE_SET_DECAYING) { |
1856 | struct cake_host *srchost = &b->hosts[flow->srchost]; |
1857 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; |
1858 | u16 host_load = 1; |
1859 | |
1860 | if (!flow->set) { |
list_add_tail(&flow->flowchain, &b->new_flows);
} else {
b->decaying_flow_count--;
list_move_tail(&flow->flowchain, &b->new_flows);
1865 | } |
1866 | flow->set = CAKE_SET_SPARSE; |
1867 | b->sparse_flow_count++; |
1868 | |
if (cake_dsrc(q->flow_mode))
1870 | host_load = max(host_load, srchost->srchost_bulk_flow_count); |
1871 | |
if (cake_ddst(q->flow_mode))
1873 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); |
1874 | |
1875 | flow->deficit = (b->flow_quantum * |
1876 | quantum_div[host_load]) >> 16; |
1877 | } else if (flow->set == CAKE_SET_SPARSE_WAIT) { |
1878 | struct cake_host *srchost = &b->hosts[flow->srchost]; |
1879 | struct cake_host *dsthost = &b->hosts[flow->dsthost]; |
1880 | |
1881 | /* this flow was empty, accounted as a sparse flow, but actually |
1882 | * in the bulk rotation. |
1883 | */ |
1884 | flow->set = CAKE_SET_BULK; |
1885 | b->sparse_flow_count--; |
1886 | b->bulk_flow_count++; |
1887 | |
if (cake_dsrc(q->flow_mode))
1889 | srchost->srchost_bulk_flow_count++; |
1890 | |
if (cake_ddst(q->flow_mode))
1892 | dsthost->dsthost_bulk_flow_count++; |
1893 | |
1894 | } |
1895 | |
1896 | if (q->buffer_used > q->buffer_max_used) |
1897 | q->buffer_max_used = q->buffer_used; |
1898 | |
1899 | if (q->buffer_used > q->buffer_limit) { |
1900 | u32 dropped = 0; |
1901 | |
1902 | while (q->buffer_used > q->buffer_limit) { |
1903 | dropped++; |
1904 | cake_drop(sch, to_free); |
1905 | } |
1906 | b->drop_overlimit += dropped; |
1907 | } |
1908 | return NET_XMIT_SUCCESS; |
1909 | } |
1910 | |
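/* Remove one packet from the currently selected flow (q->cur_tin /
 * q->cur_flow), updating the backlog accounting and the overflow heap.
 * Returns NULL if that flow is empty.
 */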
1911 | static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) |
1912 | { |
1913 | struct cake_sched_data *q = qdisc_priv(sch); |
1914 | struct cake_tin_data *b = &q->tins[q->cur_tin]; |
1915 | struct cake_flow *flow = &b->flows[q->cur_flow]; |
1916 | struct sk_buff *skb = NULL; |
1917 | u32 len; |
1918 | |
1919 | if (flow->head) { |
1920 | skb = dequeue_head(flow); |
1921 | len = qdisc_pkt_len(skb); |
1922 | b->backlogs[q->cur_flow] -= len; |
1923 | b->tin_backlog -= len; |
1924 | sch->qstats.backlog -= len; |
1925 | q->buffer_used -= skb->truesize; |
1926 | sch->q.qlen--; |
1927 | |
1928 | if (q->overflow_timeout) |
cake_heapify(q, b->overflow_idx[q->cur_flow]);
1930 | } |
1931 | return skb; |
1932 | } |
1933 | |
1934 | /* Discard leftover packets from a tin no longer in use. */ |
1935 | static void cake_clear_tin(struct Qdisc *sch, u16 tin) |
1936 | { |
1937 | struct cake_sched_data *q = qdisc_priv(sch); |
1938 | struct sk_buff *skb; |
1939 | |
1940 | q->cur_tin = tin; |
1941 | for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) |
1942 | while (!!(skb = cake_dequeue_one(sch))) |
1943 | kfree_skb(skb); |
1944 | } |
1945 | |
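/* Dequeue path: enforce the global shaper, pick a tin (plain DRR when
 * unshaped; highest-priority tin meeting its schedule, or the earliest
 * scheduled one, when shaped), then run DRR++ with triple isolation
 * across that tin's flows. Each candidate packet is offered to COBALT,
 * which may ECN-mark or drop it, before one is finally delivered; the
 * shaper and deficits are advanced for everything that leaves the queue.
 */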
1946 | static struct sk_buff *cake_dequeue(struct Qdisc *sch) |
1947 | { |
1948 | struct cake_sched_data *q = qdisc_priv(sch); |
1949 | struct cake_tin_data *b = &q->tins[q->cur_tin]; |
1950 | struct cake_host *srchost, *dsthost; |
1951 | ktime_t now = ktime_get(); |
1952 | struct cake_flow *flow; |
1953 | struct list_head *head; |
1954 | bool first_flow = true; |
1955 | struct sk_buff *skb; |
1956 | u16 host_load; |
1957 | u64 delay; |
1958 | u32 len; |
1959 | |
1960 | begin: |
1961 | if (!sch->q.qlen) |
1962 | return NULL; |
1963 | |
1964 | /* global hard shaper */ |
if (ktime_after(q->time_next_packet, now) &&
    ktime_after(q->failsafe_next_packet, now)) {
1967 | u64 next = min(ktime_to_ns(q->time_next_packet), |
1968 | ktime_to_ns(q->failsafe_next_packet)); |
1969 | |
1970 | sch->qstats.overlimits++; |
qdisc_watchdog_schedule_ns(&q->watchdog, next);
1972 | return NULL; |
1973 | } |
1974 | |
1975 | /* Choose a class to work on. */ |
1976 | if (!q->rate_ns) { |
1977 | /* In unlimited mode, can't rely on shaper timings, just balance |
1978 | * with DRR |
1979 | */ |
1980 | bool wrapped = false, empty = true; |
1981 | |
1982 | while (b->tin_deficit < 0 || |
1983 | !(b->sparse_flow_count + b->bulk_flow_count)) { |
1984 | if (b->tin_deficit <= 0) |
1985 | b->tin_deficit += b->tin_quantum; |
1986 | if (b->sparse_flow_count + b->bulk_flow_count) |
1987 | empty = false; |
1988 | |
1989 | q->cur_tin++; |
1990 | b++; |
1991 | if (q->cur_tin >= q->tin_cnt) { |
1992 | q->cur_tin = 0; |
1993 | b = q->tins; |
1994 | |
1995 | if (wrapped) { |
1996 | /* It's possible for q->qlen to be |
1997 | * nonzero when we actually have no |
1998 | * packets anywhere. |
1999 | */ |
2000 | if (empty) |
2001 | return NULL; |
2002 | } else { |
2003 | wrapped = true; |
2004 | } |
2005 | } |
2006 | } |
2007 | } else { |
2008 | /* In shaped mode, choose: |
2009 | * - Highest-priority tin with queue and meeting schedule, or |
2010 | * - The earliest-scheduled tin with queue. |
2011 | */ |
2012 | ktime_t best_time = KTIME_MAX; |
2013 | int tin, best_tin = 0; |
2014 | |
2015 | for (tin = 0; tin < q->tin_cnt; tin++) { |
2016 | b = q->tins + tin; |
2017 | if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { |
2018 | ktime_t time_to_pkt = \ |
2019 | ktime_sub(b->time_next_packet, now); |
2020 | |
if (ktime_to_ns(time_to_pkt) <= 0 ||
    ktime_compare(time_to_pkt,
		  best_time) <= 0) {
2024 | best_time = time_to_pkt; |
2025 | best_tin = tin; |
2026 | } |
2027 | } |
2028 | } |
2029 | |
2030 | q->cur_tin = best_tin; |
2031 | b = q->tins + best_tin; |
2032 | |
2033 | /* No point in going further if no packets to deliver. */ |
2034 | if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) |
2035 | return NULL; |
2036 | } |
2037 | |
2038 | retry: |
2039 | /* service this class */ |
2040 | head = &b->decaying_flows; |
2041 | if (!first_flow || list_empty(head)) { |
2042 | head = &b->new_flows; |
2043 | if (list_empty(head)) { |
2044 | head = &b->old_flows; |
2045 | if (unlikely(list_empty(head))) { |
2046 | head = &b->decaying_flows; |
2047 | if (unlikely(list_empty(head))) |
2048 | goto begin; |
2049 | } |
2050 | } |
2051 | } |
2052 | flow = list_first_entry(head, struct cake_flow, flowchain); |
2053 | q->cur_flow = flow - b->flows; |
2054 | first_flow = false; |
2055 | |
2056 | /* triple isolation (modified DRR++) */ |
2057 | srchost = &b->hosts[flow->srchost]; |
2058 | dsthost = &b->hosts[flow->dsthost]; |
2059 | host_load = 1; |
2060 | |
2061 | /* flow isolation (DRR++) */ |
2062 | if (flow->deficit <= 0) { |
2063 | /* Keep all flows with deficits out of the sparse and decaying |
2064 | * rotations. No non-empty flow can go into the decaying |
2065 | * rotation, so they can't get deficits |
2066 | */ |
2067 | if (flow->set == CAKE_SET_SPARSE) { |
2068 | if (flow->head) { |
2069 | b->sparse_flow_count--; |
2070 | b->bulk_flow_count++; |
2071 | |
if (cake_dsrc(q->flow_mode))
2073 | srchost->srchost_bulk_flow_count++; |
2074 | |
if (cake_ddst(q->flow_mode))
2076 | dsthost->dsthost_bulk_flow_count++; |
2077 | |
2078 | flow->set = CAKE_SET_BULK; |
2079 | } else { |
2080 | /* we've moved it to the bulk rotation for |
2081 | * correct deficit accounting but we still want |
2082 | * to count it as a sparse flow, not a bulk one. |
2083 | */ |
2084 | flow->set = CAKE_SET_SPARSE_WAIT; |
2085 | } |
2086 | } |
2087 | |
if (cake_dsrc(q->flow_mode))
2089 | host_load = max(host_load, srchost->srchost_bulk_flow_count); |
2090 | |
if (cake_ddst(q->flow_mode))
2092 | host_load = max(host_load, dsthost->dsthost_bulk_flow_count); |
2093 | |
2094 | WARN_ON(host_load > CAKE_QUEUES); |
2095 | |
2096 | /* The get_random_u16() is a way to apply dithering to avoid |
2097 | * accumulating roundoff errors |
2098 | */ |
2099 | flow->deficit += (b->flow_quantum * quantum_div[host_load] + |
2100 | get_random_u16()) >> 16; |
list_move_tail(&flow->flowchain, &b->old_flows);
2102 | |
2103 | goto retry; |
2104 | } |
2105 | |
2106 | /* Retrieve a packet via the AQM */ |
2107 | while (1) { |
2108 | skb = cake_dequeue_one(sch); |
2109 | if (!skb) { |
2110 | /* this queue was actually empty */ |
if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2112 | b->unresponsive_flow_count--; |
2113 | |
2114 | if (flow->cvars.p_drop || flow->cvars.count || |
    ktime_before(now, flow->cvars.drop_next)) {
2116 | /* keep in the flowchain until the state has |
2117 | * decayed to rest |
2118 | */ |
list_move_tail(&flow->flowchain,
	       &b->decaying_flows);
2121 | if (flow->set == CAKE_SET_BULK) { |
2122 | b->bulk_flow_count--; |
2123 | |
if (cake_dsrc(q->flow_mode))
2125 | srchost->srchost_bulk_flow_count--; |
2126 | |
if (cake_ddst(q->flow_mode))
2128 | dsthost->dsthost_bulk_flow_count--; |
2129 | |
2130 | b->decaying_flow_count++; |
2131 | } else if (flow->set == CAKE_SET_SPARSE || |
2132 | flow->set == CAKE_SET_SPARSE_WAIT) { |
2133 | b->sparse_flow_count--; |
2134 | b->decaying_flow_count++; |
2135 | } |
2136 | flow->set = CAKE_SET_DECAYING; |
2137 | } else { |
2138 | /* remove empty queue from the flowchain */ |
list_del_init(&flow->flowchain);
2140 | if (flow->set == CAKE_SET_SPARSE || |
2141 | flow->set == CAKE_SET_SPARSE_WAIT) |
2142 | b->sparse_flow_count--; |
2143 | else if (flow->set == CAKE_SET_BULK) { |
2144 | b->bulk_flow_count--; |
2145 | |
if (cake_dsrc(q->flow_mode))
2147 | srchost->srchost_bulk_flow_count--; |
2148 | |
if (cake_ddst(q->flow_mode))
2150 | dsthost->dsthost_bulk_flow_count--; |
2151 | |
2152 | } else |
2153 | b->decaying_flow_count--; |
2154 | |
2155 | flow->set = CAKE_SET_NONE; |
2156 | } |
2157 | goto begin; |
2158 | } |
2159 | |
2160 | /* Last packet in queue may be marked, shouldn't be dropped */ |
if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
			(b->bulk_flow_count *
			 !!(q->rate_flags &
			    CAKE_FLAG_INGRESS))) ||
2165 | !flow->head) |
2166 | break; |
2167 | |
2168 | /* drop this packet, get another one */ |
2169 | if (q->rate_flags & CAKE_FLAG_INGRESS) { |
len = cake_advance_shaper(q, b, skb,
			  now, true);
2172 | flow->deficit -= len; |
2173 | b->tin_deficit -= len; |
2174 | } |
2175 | flow->dropped++; |
2176 | b->tin_dropped++; |
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2178 | qdisc_qstats_drop(sch); |
2179 | kfree_skb(skb); |
2180 | if (q->rate_flags & CAKE_FLAG_INGRESS) |
2181 | goto retry; |
2182 | } |
2183 | |
2184 | b->tin_ecn_mark += !!flow->cvars.ecn_marked; |
2185 | qdisc_bstats_update(sch, skb); |
2186 | |
2187 | /* collect delay stats */ |
2188 | delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); |
b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
b->peak_delay = cake_ewma(b->peak_delay, delay,
			  delay > b->peak_delay ? 2 : 8);
b->base_delay = cake_ewma(b->base_delay, delay,
			  delay < b->base_delay ? 2 : 8);
2194 | |
len = cake_advance_shaper(q, b, skb, now, false);
2196 | flow->deficit -= len; |
2197 | b->tin_deficit -= len; |
2198 | |
if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2200 | u64 next = min(ktime_to_ns(q->time_next_packet), |
2201 | ktime_to_ns(q->failsafe_next_packet)); |
2202 | |
qdisc_watchdog_schedule_ns(&q->watchdog, next);
2204 | } else if (!sch->q.qlen) { |
2205 | int i; |
2206 | |
2207 | for (i = 0; i < q->tin_cnt; i++) { |
2208 | if (q->tins[i].decaying_flow_count) { |
2209 | ktime_t next = \ |
2210 | ktime_add_ns(now, |
2211 | q->tins[i].cparams.target); |
2212 | |
qdisc_watchdog_schedule_ns(&q->watchdog,
			   ktime_to_ns(next));
2215 | break; |
2216 | } |
2217 | } |
2218 | } |
2219 | |
2220 | if (q->overflow_timeout) |
2221 | q->overflow_timeout--; |
2222 | |
2223 | return skb; |
2224 | } |
2225 | |
2226 | static void cake_reset(struct Qdisc *sch) |
2227 | { |
2228 | struct cake_sched_data *q = qdisc_priv(sch); |
2229 | u32 c; |
2230 | |
2231 | if (!q->tins) |
2232 | return; |
2233 | |
2234 | for (c = 0; c < CAKE_MAX_TINS; c++) |
cake_clear_tin(sch, c);
2236 | } |
2237 | |
2238 | static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { |
2239 | [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, |
2240 | [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, |
2241 | [TCA_CAKE_ATM] = { .type = NLA_U32 }, |
2242 | [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, |
2243 | [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, |
2244 | [TCA_CAKE_RTT] = { .type = NLA_U32 }, |
2245 | [TCA_CAKE_TARGET] = { .type = NLA_U32 }, |
2246 | [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, |
2247 | [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, |
2248 | [TCA_CAKE_NAT] = { .type = NLA_U32 }, |
2249 | [TCA_CAKE_RAW] = { .type = NLA_U32 }, |
2250 | [TCA_CAKE_WASH] = { .type = NLA_U32 }, |
2251 | [TCA_CAKE_MPU] = { .type = NLA_U32 }, |
2252 | [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, |
2253 | [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, |
2254 | [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, |
2255 | [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, |
2256 | }; |
2257 | |
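/* Derive per-tin shaper and AQM parameters from a byte rate. The rate is
 * stored as a fixed-point time-per-byte (rate_ns >> rate_shft); the
 * COBALT target becomes at least 1.5x the MTU serialization time, and the
 * interval is stretched to keep the same headroom above the target.
 *
 * Illustrative numbers (assuming rate is in bytes/sec, as the
 * flow_quantum clamp suggests): at rate = 1,250,000 (~10 Mbit/s) with a
 * 1514-byte MTU, the MTU serialization time is ~1.21 ms, so the default
 * 5 ms target and 100 ms interval are left unchanged, and flow_quantum
 * becomes rate >> 12 = 305 bytes.
 */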
2258 | static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, |
2259 | u64 target_ns, u64 rtt_est_ns) |
2260 | { |
2261 | /* convert byte-rate into time-per-byte |
2262 | * so it will always unwedge in reasonable time. |
2263 | */ |
2264 | static const u64 MIN_RATE = 64; |
2265 | u32 byte_target = mtu; |
2266 | u64 byte_target_ns; |
2267 | u8 rate_shft = 0; |
2268 | u64 rate_ns = 0; |
2269 | |
2270 | b->flow_quantum = 1514; |
2271 | if (rate) { |
2272 | b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); |
2273 | rate_shft = 34; |
2274 | rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; |
rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2276 | while (!!(rate_ns >> 34)) { |
2277 | rate_ns >>= 1; |
2278 | rate_shft--; |
2279 | } |
2280 | } /* else unlimited, ie. zero delay */ |
2281 | |
2282 | b->tin_rate_bps = rate; |
2283 | b->tin_rate_ns = rate_ns; |
2284 | b->tin_rate_shft = rate_shft; |
2285 | |
2286 | byte_target_ns = (byte_target * rate_ns) >> rate_shft; |
2287 | |
2288 | b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); |
2289 | b->cparams.interval = max(rtt_est_ns + |
2290 | b->cparams.target - target_ns, |
2291 | b->cparams.target * 2); |
2292 | b->cparams.mtu_time = byte_target_ns; |
2293 | b->cparams.p_inc = 1 << 24; /* 1/256 */ |
2294 | b->cparams.p_dec = 1 << 20; /* 1/4096 */ |
2295 | } |
2296 | |
2297 | static int cake_config_besteffort(struct Qdisc *sch) |
2298 | { |
2299 | struct cake_sched_data *q = qdisc_priv(sch); |
2300 | struct cake_tin_data *b = &q->tins[0]; |
u32 mtu = psched_mtu(qdisc_dev(sch));
2302 | u64 rate = q->rate_bps; |
2303 | |
2304 | q->tin_cnt = 1; |
2305 | |
2306 | q->tin_index = besteffort; |
2307 | q->tin_order = normal_order; |
2308 | |
cake_set_rate(b, rate, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
2311 | b->tin_quantum = 65535; |
2312 | |
2313 | return 0; |
2314 | } |
2315 | |
2316 | static int cake_config_precedence(struct Qdisc *sch) |
2317 | { |
2318 | /* convert high-level (user visible) parameters into internal format */ |
2319 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2321 | u64 rate = q->rate_bps; |
2322 | u32 quantum = 256; |
2323 | u32 i; |
2324 | |
2325 | q->tin_cnt = 8; |
2326 | q->tin_index = precedence; |
2327 | q->tin_order = normal_order; |
2328 | |
2329 | for (i = 0; i < q->tin_cnt; i++) { |
2330 | struct cake_tin_data *b = &q->tins[i]; |
2331 | |
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
	      us_to_ns(q->interval));
2334 | |
2335 | b->tin_quantum = max_t(u16, 1U, quantum); |
2336 | |
2337 | /* calculate next class's parameters */ |
2338 | rate *= 7; |
2339 | rate >>= 3; |
2340 | |
2341 | quantum *= 7; |
2342 | quantum >>= 3; |
2343 | } |
2344 | |
2345 | return 0; |
2346 | } |
2347 | |
2348 | /* List of known Diffserv codepoints: |
2349 | * |
2350 | * Default Forwarding (DF/CS0) - Best Effort |
2351 | * Max Throughput (TOS2) |
2352 | * Min Delay (TOS4) |
2353 | * LLT "La" (TOS5) |
2354 | * Assured Forwarding 1 (AF1x) - x3 |
2355 | * Assured Forwarding 2 (AF2x) - x3 |
2356 | * Assured Forwarding 3 (AF3x) - x3 |
2357 | * Assured Forwarding 4 (AF4x) - x3 |
2358 | * Precedence Class 1 (CS1) |
2359 | * Precedence Class 2 (CS2) |
2360 | * Precedence Class 3 (CS3) |
2361 | * Precedence Class 4 (CS4) |
2362 | * Precedence Class 5 (CS5) |
2363 | * Precedence Class 6 (CS6) |
2364 | * Precedence Class 7 (CS7) |
2365 | * Voice Admit (VA) |
2366 | * Expedited Forwarding (EF) |
2367 | * Lower Effort (LE) |
2368 | * |
2369 | * Total 26 codepoints. |
2370 | */ |
2371 | |
2372 | /* List of traffic classes in RFC 4594, updated by RFC 8622: |
2373 | * (roughly descending order of contended priority) |
2374 | * (roughly ascending order of uncontended throughput) |
2375 | * |
2376 | * Network Control (CS6,CS7) - routing traffic |
2377 | * Telephony (EF,VA) - aka. VoIP streams |
2378 | * Signalling (CS5) - VoIP setup |
2379 | * Multimedia Conferencing (AF4x) - aka. video calls |
2380 | * Realtime Interactive (CS4) - eg. games |
2381 | * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch |
2382 | * Broadcast Video (CS3) |
2383 | * Low-Latency Data (AF2x,TOS4) - eg. database |
2384 | * Ops, Admin, Management (CS2) - eg. ssh |
2385 | * Standard Service (DF & unrecognised codepoints) |
2386 | * High-Throughput Data (AF1x,TOS2) - eg. web traffic |
2387 | * Low-Priority Data (LE,CS1) - eg. BitTorrent |
2388 | * |
2389 | * Total 12 traffic classes. |
2390 | */ |
2391 | |
2392 | static int cake_config_diffserv8(struct Qdisc *sch) |
2393 | { |
2394 | /* Pruned list of traffic classes for typical applications: |
2395 | * |
2396 | * Network Control (CS6, CS7) |
2397 | * Minimum Latency (EF, VA, CS5, CS4) |
2398 | * Interactive Shell (CS2) |
2399 | * Low Latency Transactions (AF2x, TOS4) |
2400 | * Video Streaming (AF4x, AF3x, CS3) |
2401 | * Bog Standard (DF etc.) |
2402 | * High Throughput (AF1x, TOS2, CS1) |
2403 | * Background Traffic (LE) |
2404 | * |
2405 | * Total 8 traffic classes. |
2406 | */ |
2407 | |
2408 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2410 | u64 rate = q->rate_bps; |
2411 | u32 quantum = 256; |
2412 | u32 i; |
2413 | |
2414 | q->tin_cnt = 8; |
2415 | |
2416 | /* codepoint to class mapping */ |
2417 | q->tin_index = diffserv8; |
2418 | q->tin_order = normal_order; |
2419 | |
2420 | /* class characteristics */ |
2421 | for (i = 0; i < q->tin_cnt; i++) { |
2422 | struct cake_tin_data *b = &q->tins[i]; |
2423 | |
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
	      us_to_ns(q->interval));
2426 | |
2427 | b->tin_quantum = max_t(u16, 1U, quantum); |
2428 | |
2429 | /* calculate next class's parameters */ |
2430 | rate *= 7; |
2431 | rate >>= 3; |
2432 | |
2433 | quantum *= 7; |
2434 | quantum >>= 3; |
2435 | } |
2436 | |
2437 | return 0; |
2438 | } |
2439 | |
2440 | static int cake_config_diffserv4(struct Qdisc *sch) |
2441 | { |
2442 | /* Further pruned list of traffic classes for four-class system: |
2443 | * |
2444 | * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) |
2445 | * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) |
2446 | * Best Effort (DF, AF1x, TOS2, and those not specified) |
2447 | * Background Traffic (LE, CS1) |
2448 | * |
2449 | * Total 4 traffic classes. |
2450 | */ |
2451 | |
2452 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2454 | u64 rate = q->rate_bps; |
2455 | u32 quantum = 1024; |
2456 | |
2457 | q->tin_cnt = 4; |
2458 | |
2459 | /* codepoint to class mapping */ |
2460 | q->tin_index = diffserv4; |
2461 | q->tin_order = bulk_order; |
2462 | |
2463 | /* class characteristics */ |
cake_set_rate(&q->tins[0], rate, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[1], rate >> 4, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[2], rate >> 1, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[3], rate >> 2, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
2472 | |
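/* The threshold rates above are fixed fractions of the base rate
 * (1, 1/16, 1/2 and 1/4 for tins 0-3); the DRR quanta below use the same
 * ratios, so a tin's scheduling weight matches its bandwidth share once
 * it operates above its threshold.
 */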
2473 | /* bandwidth-sharing weights */ |
2474 | q->tins[0].tin_quantum = quantum; |
2475 | q->tins[1].tin_quantum = quantum >> 4; |
2476 | q->tins[2].tin_quantum = quantum >> 1; |
2477 | q->tins[3].tin_quantum = quantum >> 2; |
2478 | |
2479 | return 0; |
2480 | } |
2481 | |
2482 | static int cake_config_diffserv3(struct Qdisc *sch) |
2483 | { |
2484 | /* Simplified Diffserv structure with 3 tins. |
2485 | * Latency Sensitive (CS7, CS6, EF, VA, TOS4) |
2486 | * Best Effort |
2487 | * Low Priority (LE, CS1) |
2488 | */ |
2489 | struct cake_sched_data *q = qdisc_priv(sch); |
u32 mtu = psched_mtu(qdisc_dev(sch));
2491 | u64 rate = q->rate_bps; |
2492 | u32 quantum = 1024; |
2493 | |
2494 | q->tin_cnt = 3; |
2495 | |
2496 | /* codepoint to class mapping */ |
2497 | q->tin_index = diffserv3; |
2498 | q->tin_order = bulk_order; |
2499 | |
2500 | /* class characteristics */ |
cake_set_rate(&q->tins[0], rate, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[1], rate >> 4, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
cake_set_rate(&q->tins[2], rate >> 2, mtu,
	      us_to_ns(q->target), us_to_ns(q->interval));
2507 | |
2508 | /* bandwidth-sharing weights */ |
2509 | q->tins[0].tin_quantum = quantum; |
2510 | q->tins[1].tin_quantum = quantum >> 4; |
2511 | q->tins[2].tin_quantum = quantum >> 2; |
2512 | |
2513 | return 0; |
2514 | } |
2515 | |
2516 | static void cake_reconfigure(struct Qdisc *sch) |
2517 | { |
2518 | struct cake_sched_data *q = qdisc_priv(sch); |
2519 | int c, ft; |
2520 | |
2521 | switch (q->tin_mode) { |
2522 | case CAKE_DIFFSERV_BESTEFFORT: |
2523 | ft = cake_config_besteffort(sch); |
2524 | break; |
2525 | |
2526 | case CAKE_DIFFSERV_PRECEDENCE: |
2527 | ft = cake_config_precedence(sch); |
2528 | break; |
2529 | |
2530 | case CAKE_DIFFSERV_DIFFSERV8: |
2531 | ft = cake_config_diffserv8(sch); |
2532 | break; |
2533 | |
2534 | case CAKE_DIFFSERV_DIFFSERV4: |
2535 | ft = cake_config_diffserv4(sch); |
2536 | break; |
2537 | |
2538 | case CAKE_DIFFSERV_DIFFSERV3: |
2539 | default: |
2540 | ft = cake_config_diffserv3(sch); |
2541 | break; |
2542 | } |
2543 | |
2544 | for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { |
cake_clear_tin(sch, c);
2546 | q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; |
2547 | } |
2548 | |
2549 | q->rate_ns = q->tins[ft].tin_rate_ns; |
2550 | q->rate_shft = q->tins[ft].tin_rate_shft; |
2551 | |
2552 | if (q->buffer_config_limit) { |
2553 | q->buffer_limit = q->buffer_config_limit; |
2554 | } else if (q->rate_bps) { |
2555 | u64 t = q->rate_bps * q->interval; |
2556 | |
2557 | do_div(t, USEC_PER_SEC / 4); |
2558 | q->buffer_limit = max_t(u32, t, 4U << 20); |
2559 | } else { |
2560 | q->buffer_limit = ~0; |
2561 | } |
2562 | |
2563 | sch->flags &= ~TCQ_F_CAN_BYPASS; |
2564 | |
2565 | q->buffer_limit = min(q->buffer_limit, |
2566 | max(sch->limit * psched_mtu(qdisc_dev(sch)), |
2567 | q->buffer_config_limit)); |
2568 | } |
2569 | |
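/* Parse netlink configuration (validated against cake_policy), update the
 * qdisc parameters, and re-derive per-tin state under the tree lock if
 * the tins have already been allocated. Called both for the initial
 * options via cake_init() and for "tc qdisc change".
 */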
2570 | static int cake_change(struct Qdisc *sch, struct nlattr *opt, |
2571 | struct netlink_ext_ack *extack) |
2572 | { |
2573 | struct cake_sched_data *q = qdisc_priv(sch); |
2574 | struct nlattr *tb[TCA_CAKE_MAX + 1]; |
2575 | int err; |
2576 | |
err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2578 | extack); |
2579 | if (err < 0) |
2580 | return err; |
2581 | |
2582 | if (tb[TCA_CAKE_NAT]) { |
2583 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
2584 | q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; |
2585 | q->flow_mode |= CAKE_FLOW_NAT_FLAG * |
!!nla_get_u32(tb[TCA_CAKE_NAT]);
2587 | #else |
NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
		    "No conntrack support in kernel");
2590 | return -EOPNOTSUPP; |
2591 | #endif |
2592 | } |
2593 | |
2594 | if (tb[TCA_CAKE_BASE_RATE64]) |
q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2596 | |
2597 | if (tb[TCA_CAKE_DIFFSERV_MODE]) |
q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2599 | |
2600 | if (tb[TCA_CAKE_WASH]) { |
if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2602 | q->rate_flags |= CAKE_FLAG_WASH; |
2603 | else |
2604 | q->rate_flags &= ~CAKE_FLAG_WASH; |
2605 | } |
2606 | |
2607 | if (tb[TCA_CAKE_FLOW_MODE]) |
2608 | q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | |
(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2610 | CAKE_FLOW_MASK)); |
2611 | |
2612 | if (tb[TCA_CAKE_ATM]) |
q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2614 | |
2615 | if (tb[TCA_CAKE_OVERHEAD]) { |
q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2617 | q->rate_flags |= CAKE_FLAG_OVERHEAD; |
2618 | |
2619 | q->max_netlen = 0; |
2620 | q->max_adjlen = 0; |
2621 | q->min_netlen = ~0; |
2622 | q->min_adjlen = ~0; |
2623 | } |
2624 | |
2625 | if (tb[TCA_CAKE_RAW]) { |
2626 | q->rate_flags &= ~CAKE_FLAG_OVERHEAD; |
2627 | |
2628 | q->max_netlen = 0; |
2629 | q->max_adjlen = 0; |
2630 | q->min_netlen = ~0; |
2631 | q->min_adjlen = ~0; |
2632 | } |
2633 | |
2634 | if (tb[TCA_CAKE_MPU]) |
q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2636 | |
2637 | if (tb[TCA_CAKE_RTT]) { |
q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2639 | |
2640 | if (!q->interval) |
2641 | q->interval = 1; |
2642 | } |
2643 | |
2644 | if (tb[TCA_CAKE_TARGET]) { |
q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2646 | |
2647 | if (!q->target) |
2648 | q->target = 1; |
2649 | } |
2650 | |
2651 | if (tb[TCA_CAKE_AUTORATE]) { |
if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2653 | q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; |
2654 | else |
2655 | q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; |
2656 | } |
2657 | |
2658 | if (tb[TCA_CAKE_INGRESS]) { |
if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2660 | q->rate_flags |= CAKE_FLAG_INGRESS; |
2661 | else |
2662 | q->rate_flags &= ~CAKE_FLAG_INGRESS; |
2663 | } |
2664 | |
2665 | if (tb[TCA_CAKE_ACK_FILTER]) |
q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2667 | |
2668 | if (tb[TCA_CAKE_MEMORY]) |
q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2670 | |
2671 | if (tb[TCA_CAKE_SPLIT_GSO]) { |
if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2673 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; |
2674 | else |
2675 | q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; |
2676 | } |
2677 | |
2678 | if (tb[TCA_CAKE_FWMARK]) { |
q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2680 | q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0; |
2681 | } |
2682 | |
2683 | if (q->tins) { |
sch_tree_lock(sch);
cake_reconfigure(sch);
sch_tree_unlock(sch);
2687 | } |
2688 | |
2689 | return 0; |
2690 | } |
2691 | |
2692 | static void cake_destroy(struct Qdisc *sch) |
2693 | { |
2694 | struct cake_sched_data *q = qdisc_priv(sch); |
2695 | |
qdisc_watchdog_cancel(&q->watchdog);
tcf_block_put(q->block);
kvfree(q->tins);
2699 | } |
2700 | |
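/* Set up the defaults (diffserv3, triple isolation, unlimited rate,
 * 100 ms rtt / 5 ms target, GSO splitting on), apply any initial netlink
 * options, allocate the tin/flow arrays and overflow-heap indices, and
 * run the first reconfiguration.
 */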
2701 | static int cake_init(struct Qdisc *sch, struct nlattr *opt, |
2702 | struct netlink_ext_ack *extack) |
2703 | { |
2704 | struct cake_sched_data *q = qdisc_priv(sch); |
2705 | int i, j, err; |
2706 | |
2707 | sch->limit = 10240; |
2708 | q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; |
2709 | q->flow_mode = CAKE_FLOW_TRIPLE; |
2710 | |
2711 | q->rate_bps = 0; /* unlimited by default */ |
2712 | |
2713 | q->interval = 100000; /* 100ms default */ |
2714 | q->target = 5000; /* 5ms: codel RFC argues |
2715 | * for 5 to 10% of interval |
2716 | */ |
2717 | q->rate_flags |= CAKE_FLAG_SPLIT_GSO; |
2718 | q->cur_tin = 0; |
2719 | q->cur_flow = 0; |
2720 | |
qdisc_watchdog_init(&q->watchdog, sch);
2722 | |
2723 | if (opt) { |
2724 | err = cake_change(sch, opt, extack); |
2725 | |
2726 | if (err) |
2727 | return err; |
2728 | } |
2729 | |
err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2731 | if (err) |
2732 | return err; |
2733 | |
2734 | quantum_div[0] = ~0; |
2735 | for (i = 1; i <= CAKE_QUEUES; i++) |
2736 | quantum_div[i] = 65535 / i; |
2737 | |
q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
		   GFP_KERNEL);
2740 | if (!q->tins) |
2741 | return -ENOMEM; |
2742 | |
2743 | for (i = 0; i < CAKE_MAX_TINS; i++) { |
2744 | struct cake_tin_data *b = q->tins + i; |
2745 | |
INIT_LIST_HEAD(&b->new_flows);
INIT_LIST_HEAD(&b->old_flows);
INIT_LIST_HEAD(&b->decaying_flows);
2749 | b->sparse_flow_count = 0; |
2750 | b->bulk_flow_count = 0; |
2751 | b->decaying_flow_count = 0; |
2752 | |
2753 | for (j = 0; j < CAKE_QUEUES; j++) { |
2754 | struct cake_flow *flow = b->flows + j; |
2755 | u32 k = j * CAKE_MAX_TINS + i; |
2756 | |
INIT_LIST_HEAD(&flow->flowchain);
cobalt_vars_init(&flow->cvars);
2759 | |
2760 | q->overflow_heap[k].t = i; |
2761 | q->overflow_heap[k].b = j; |
2762 | b->overflow_idx[j] = k; |
2763 | } |
2764 | } |
2765 | |
2766 | cake_reconfigure(sch); |
2767 | q->avg_peak_bandwidth = q->rate_bps; |
2768 | q->min_netlen = ~0; |
2769 | q->min_adjlen = ~0; |
2770 | return 0; |
2771 | } |
2772 | |
2773 | static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) |
2774 | { |
2775 | struct cake_sched_data *q = qdisc_priv(sch); |
2776 | struct nlattr *opts; |
2777 | |
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2779 | if (!opts) |
2780 | goto nla_put_failure; |
2781 | |
if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
		      TCA_CAKE_PAD))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
		q->flow_mode & CAKE_FLOW_MASK))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
		!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_INGRESS,
		!!(q->rate_flags & CAKE_FLAG_INGRESS)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_NAT,
		!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_WASH,
		!!(q->rate_flags & CAKE_FLAG_WASH)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
goto nla_put_failure;

if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
		!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
goto nla_put_failure;

if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
goto nla_put_failure;

return nla_nest_end(skb, opts);
2842 | |
2843 | nla_put_failure: |
2844 | return -1; |
2845 | } |
2846 | |
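/* Dump global statistics (capacity estimate, memory usage, packet size
 * extremes) followed by one nested block per active tin, emitted in
 * q->tin_order so user space sees the tins in display order.
 */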
2847 | static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) |
2848 | { |
struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2850 | struct cake_sched_data *q = qdisc_priv(sch); |
2851 | struct nlattr *tstats, *ts; |
2852 | int i; |
2853 | |
2854 | if (!stats) |
2855 | return -1; |
2856 | |
2857 | #define PUT_STAT_U32(attr, data) do { \ |
2858 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ |
2859 | goto nla_put_failure; \ |
2860 | } while (0) |
2861 | #define PUT_STAT_U64(attr, data) do { \ |
2862 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ |
2863 | data, TCA_CAKE_STATS_PAD)) \ |
2864 | goto nla_put_failure; \ |
2865 | } while (0) |
2866 | |
2867 | PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); |
2868 | PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); |
2869 | PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); |
2870 | PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); |
2871 | PUT_STAT_U32(MAX_NETLEN, q->max_netlen); |
2872 | PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); |
2873 | PUT_STAT_U32(MIN_NETLEN, q->min_netlen); |
2874 | PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); |
2875 | |
2876 | #undef PUT_STAT_U32 |
2877 | #undef PUT_STAT_U64 |
2878 | |
tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2880 | if (!tstats) |
2881 | goto nla_put_failure; |
2882 | |
2883 | #define PUT_TSTAT_U32(attr, data) do { \ |
2884 | if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ |
2885 | goto nla_put_failure; \ |
2886 | } while (0) |
2887 | #define PUT_TSTAT_U64(attr, data) do { \ |
2888 | if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ |
2889 | data, TCA_CAKE_TIN_STATS_PAD)) \ |
2890 | goto nla_put_failure; \ |
2891 | } while (0) |
2892 | |
2893 | for (i = 0; i < q->tin_cnt; i++) { |
2894 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; |
2895 | |
ts = nla_nest_start_noflag(d->skb, i + 1);
2897 | if (!ts) |
2898 | goto nla_put_failure; |
2899 | |
2900 | PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); |
2901 | PUT_TSTAT_U64(SENT_BYTES64, b->bytes); |
2902 | PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); |
2903 | |
2904 | PUT_TSTAT_U32(TARGET_US, |
2905 | ktime_to_us(ns_to_ktime(b->cparams.target))); |
2906 | PUT_TSTAT_U32(INTERVAL_US, |
2907 | ktime_to_us(ns_to_ktime(b->cparams.interval))); |
2908 | |
2909 | PUT_TSTAT_U32(SENT_PACKETS, b->packets); |
2910 | PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); |
2911 | PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); |
2912 | PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); |
2913 | |
2914 | PUT_TSTAT_U32(PEAK_DELAY_US, |
2915 | ktime_to_us(ns_to_ktime(b->peak_delay))); |
2916 | PUT_TSTAT_U32(AVG_DELAY_US, |
2917 | ktime_to_us(ns_to_ktime(b->avge_delay))); |
2918 | PUT_TSTAT_U32(BASE_DELAY_US, |
2919 | ktime_to_us(ns_to_ktime(b->base_delay))); |
2920 | |
2921 | PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); |
2922 | PUT_TSTAT_U32(WAY_MISSES, b->way_misses); |
2923 | PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); |
2924 | |
2925 | PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + |
2926 | b->decaying_flow_count); |
2927 | PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); |
2928 | PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); |
2929 | PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); |
2930 | |
2931 | PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); |
nla_nest_end(d->skb, ts);
2933 | } |
2934 | |
2935 | #undef PUT_TSTAT_U32 |
2936 | #undef PUT_TSTAT_U64 |
2937 | |
nla_nest_end(d->skb, tstats);
return nla_nest_end(d->skb, stats);
2940 | |
2941 | nla_put_failure: |
nla_nest_cancel(d->skb, stats);
2943 | return -1; |
2944 | } |
2945 | |
2946 | static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) |
2947 | { |
2948 | return NULL; |
2949 | } |
2950 | |
2951 | static unsigned long cake_find(struct Qdisc *sch, u32 classid) |
2952 | { |
2953 | return 0; |
2954 | } |
2955 | |
2956 | static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, |
2957 | u32 classid) |
2958 | { |
2959 | return 0; |
2960 | } |
2961 | |
2962 | static void cake_unbind(struct Qdisc *q, unsigned long cl) |
2963 | { |
2964 | } |
2965 | |
2966 | static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, |
2967 | struct netlink_ext_ack *extack) |
2968 | { |
2969 | struct cake_sched_data *q = qdisc_priv(sch); |
2970 | |
2971 | if (cl) |
2972 | return NULL; |
2973 | return q->block; |
2974 | } |
2975 | |
2976 | static int cake_dump_class(struct Qdisc *sch, unsigned long cl, |
2977 | struct sk_buff *skb, struct tcmsg *tcm) |
2978 | { |
2979 | tcm->tcm_handle |= TC_H_MIN(cl); |
2980 | return 0; |
2981 | } |
2982 | |
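/* Per-"class" statistics: class IDs map to individual flow queues as
 * (tin_order index * CAKE_QUEUES) + flow + 1, matching cake_walk().
 * The queue length is counted by walking the flow's skb list under the
 * tree lock, and the flow's deficit and COBALT state are reported.
 */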
2983 | static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, |
2984 | struct gnet_dump *d) |
2985 | { |
2986 | struct cake_sched_data *q = qdisc_priv(sch); |
2987 | const struct cake_flow *flow = NULL; |
2988 | struct gnet_stats_queue qs = { 0 }; |
2989 | struct nlattr *stats; |
2990 | u32 idx = cl - 1; |
2991 | |
2992 | if (idx < CAKE_QUEUES * q->tin_cnt) { |
2993 | const struct cake_tin_data *b = \ |
2994 | &q->tins[q->tin_order[idx / CAKE_QUEUES]]; |
2995 | const struct sk_buff *skb; |
2996 | |
2997 | flow = &b->flows[idx % CAKE_QUEUES]; |
2998 | |
2999 | if (flow->head) { |
sch_tree_lock(sch);
3001 | skb = flow->head; |
3002 | while (skb) { |
3003 | qs.qlen++; |
3004 | skb = skb->next; |
3005 | } |
sch_tree_unlock(sch);
3007 | } |
3008 | qs.backlog = b->backlogs[idx % CAKE_QUEUES]; |
3009 | qs.drops = flow->dropped; |
3010 | } |
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3012 | return -1; |
3013 | if (flow) { |
3014 | ktime_t now = ktime_get(); |
3015 | |
stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3017 | if (!stats) |
3018 | return -1; |
3019 | |
3020 | #define PUT_STAT_U32(attr, data) do { \ |
3021 | if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ |
3022 | goto nla_put_failure; \ |
3023 | } while (0) |
3024 | #define PUT_STAT_S32(attr, data) do { \ |
3025 | if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ |
3026 | goto nla_put_failure; \ |
3027 | } while (0) |
3028 | |
3029 | PUT_STAT_S32(DEFICIT, flow->deficit); |
3030 | PUT_STAT_U32(DROPPING, flow->cvars.dropping); |
3031 | PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); |
3032 | PUT_STAT_U32(P_DROP, flow->cvars.p_drop); |
3033 | if (flow->cvars.p_drop) { |
3034 | PUT_STAT_S32(BLUE_TIMER_US, |
3035 | ktime_to_us( |
3036 | ktime_sub(now, |
3037 | flow->cvars.blue_timer))); |
3038 | } |
3039 | if (flow->cvars.dropping) { |
3040 | PUT_STAT_S32(DROP_NEXT_US, |
3041 | ktime_to_us( |
3042 | ktime_sub(now, |
3043 | flow->cvars.drop_next))); |
3044 | } |
3045 | |
if (nla_nest_end(d->skb, stats) < 0)
3047 | return -1; |
3048 | } |
3049 | |
3050 | return 0; |
3051 | |
3052 | nla_put_failure: |
nla_nest_cancel(d->skb, stats);
3054 | return -1; |
3055 | } |
3056 | |
3057 | static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) |
3058 | { |
3059 | struct cake_sched_data *q = qdisc_priv(sch); |
3060 | unsigned int i, j; |
3061 | |
3062 | if (arg->stop) |
3063 | return; |
3064 | |
3065 | for (i = 0; i < q->tin_cnt; i++) { |
3066 | struct cake_tin_data *b = &q->tins[q->tin_order[i]]; |
3067 | |
3068 | for (j = 0; j < CAKE_QUEUES; j++) { |
if (list_empty(&b->flows[j].flowchain)) {
3070 | arg->count++; |
3071 | continue; |
3072 | } |
if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
			 arg))
3075 | break; |
3076 | } |
3077 | } |
3078 | } |
3079 | |
3080 | static const struct Qdisc_class_ops cake_class_ops = { |
3081 | .leaf = cake_leaf, |
3082 | .find = cake_find, |
3083 | .tcf_block = cake_tcf_block, |
3084 | .bind_tcf = cake_bind, |
3085 | .unbind_tcf = cake_unbind, |
3086 | .dump = cake_dump_class, |
3087 | .dump_stats = cake_dump_class_stats, |
3088 | .walk = cake_walk, |
3089 | }; |
3090 | |
3091 | static struct Qdisc_ops cake_qdisc_ops __read_mostly = { |
3092 | .cl_ops = &cake_class_ops, |
.id = "cake",
3094 | .priv_size = sizeof(struct cake_sched_data), |
3095 | .enqueue = cake_enqueue, |
3096 | .dequeue = cake_dequeue, |
3097 | .peek = qdisc_peek_dequeued, |
3098 | .init = cake_init, |
3099 | .reset = cake_reset, |
3100 | .destroy = cake_destroy, |
3101 | .change = cake_change, |
3102 | .dump = cake_dump, |
3103 | .dump_stats = cake_dump_stats, |
3104 | .owner = THIS_MODULE, |
3105 | }; |
3106 | |
3107 | static int __init cake_module_init(void) |
3108 | { |
return register_qdisc(&cake_qdisc_ops);
3110 | } |
3111 | |
3112 | static void __exit cake_module_exit(void) |
3113 | { |
unregister_qdisc(&cake_qdisc_ops);
3115 | } |
3116 | |
3117 | module_init(cake_module_init) |
3118 | module_exit(cake_module_exit) |
MODULE_AUTHOR("Jonathan Morton");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("The CAKE shaper.");
3122 | |