// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic :
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as fallback, with a 32-bit wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one of the Round Robin 'queues' (new or old flows).
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 * - lookup one RB tree (out of 1024 or more) to find the flow.
 *   If non existent flow, create it, add it to the tree.
 *   Add skb to the per flow list of skb (fifo).
 * - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it's expected to send additional packets,
 * or the SLAB cache will reuse the socket for another flow)
 */
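
/* Illustrative setup from user space (a sketch, not part of this file;
 * the device name and rate below are assumptions for the example) :
 *
 *   tc qdisc replace dev eth0 root fq
 *   tc qdisc replace dev eth0 root fq maxrate 1gbit quantum 3028
 *
 * With pacing enabled (the default), sockets get their sk_pacing_rate
 * honored; 'maxrate' additionally caps every flow at the given rate.
 */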

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

struct fq_skb_cb {
	u64	time_to_send;
	u8	band;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in a linear list (head, tail), otherwise they are placed in a rbtree (t_root).
 */
struct fq_flow {
/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
	};
	union {
		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
		/* Following field is only used for q->internal,
		 * because q->internal is not hashed in fq_root[]
		 */
		u64		stat_fastpath_packets;
	};
	struct sock	*sk;
	u32		socket_hash;	/* sk_hash */
	int		qlen;		/* number of packets in flow queue */

/* Second cache line */
	int		credit;
	int		band;
	struct fq_flow *next;		/* next pointer in RR lists */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
};

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_perband_flows {
	struct fq_flow_head new_flows;
	struct fq_flow_head old_flows;
	int		    credit;
	int		    quantum; /* based on band nr : 576KB, 192KB, 64KB */
};

struct fq_sched_data {
/* Read mostly cache line */

	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u64		horizon;	/* horizon in ns */
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;
	u8		horizon_drop;
	u8		prio2band[(TC_PRIO_MAX + 1) >> 2];
	u32		timer_slack;	/* hrtimer slack in ns */

/* Read/Write fields. */

	unsigned int	band_nr;	/* band being serviced in fq_dequeue() */

	struct fq_perband_flows band_flows[FQ_BANDS];

	struct fq_flow	internal;	/* fastpath queue. */
	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	unsigned long	unthrottle_latency_ns;

	u32		band_pkt_count[FQ_BANDS];
	u32		flows;
	u32		inactive_flows;	/* Flows with no packet to send. */
	u32		throttled_flows;

	u64		stat_throttled;
	struct qdisc_watchdog watchdog;
	u64		stat_gc_flows;

/* Seldom used fields. */

	u64		stat_band_drops[FQ_BANDS];
	u64		stat_ce_mark;
	u64		stat_horizon_drops;
	u64		stat_horizon_caps;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;
};

/* return the i-th 2-bit value ("crumb") */
static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
{
	return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3;
}
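
/* Worked example (illustrative) : with prio == 6, the crumb lives in
 * prio2band[1] (6 / 4 == 1) at bit offset 4 (2 * (6 & 0x3)), so
 * fq_prio2band(prio2band, 6) == (prio2band[1] >> 4) & 0x3.
 */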
164 | |
165 | /* |
166 | * f->tail and f->age share the same location. |
167 | * We can use the low order bit to differentiate if this location points |
168 | * to a sk_buff or contains a jiffies value, if we force this value to be odd. |
169 | * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2 |
170 | */ |
171 | static void fq_flow_set_detached(struct fq_flow *f) |
172 | { |
173 | f->age = jiffies | 1UL; |
174 | } |
175 | |
176 | static bool fq_flow_is_detached(const struct fq_flow *f) |
177 | { |
178 | return !!(f->age & 1UL); |
179 | } |
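
/* Worked example (illustrative) : if jiffies == 0x1000 when the flow empties,
 * f->age becomes 0x1001 (odd). A valid f->tail pointer is always even, so
 * fq_flow_is_detached() can safely test the low order bit of the union.
 */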
180 | |
181 | /* special value to mark a throttled flow (not on old/new list) */ |
182 | static struct fq_flow throttled; |
183 | |
184 | static bool fq_flow_is_throttled(const struct fq_flow *f) |
185 | { |
186 | return f->next == &throttled; |
187 | } |
188 | |
189 | enum new_flow { |
190 | NEW_FLOW, |
191 | OLD_FLOW |
192 | }; |
193 | |
194 | static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow, |
195 | enum new_flow list_sel) |
196 | { |
197 | struct fq_perband_flows *pband = &q->band_flows[flow->band]; |
198 | struct fq_flow_head *head = (list_sel == NEW_FLOW) ? |
199 | &pband->new_flows : |
200 | &pband->old_flows; |
201 | |
202 | if (head->first) |
203 | head->last->next = flow; |
204 | else |
205 | head->first = flow; |
206 | head->last = flow; |
207 | flow->next = NULL; |
208 | } |
209 | |
210 | static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f) |
211 | { |
212 | rb_erase(&f->rate_node, &q->delayed); |
213 | q->throttled_flows--; |
214 | fq_flow_add_tail(q, flow: f, list_sel: OLD_FLOW); |
215 | } |
216 | |
217 | static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) |
218 | { |
219 | struct rb_node **p = &q->delayed.rb_node, *parent = NULL; |
220 | |
221 | while (*p) { |
222 | struct fq_flow *aux; |
223 | |
224 | parent = *p; |
225 | aux = rb_entry(parent, struct fq_flow, rate_node); |
226 | if (f->time_next_packet >= aux->time_next_packet) |
227 | p = &parent->rb_right; |
228 | else |
229 | p = &parent->rb_left; |
230 | } |
231 | rb_link_node(node: &f->rate_node, parent, rb_link: p); |
232 | rb_insert_color(&f->rate_node, &q->delayed); |
233 | q->throttled_flows++; |
234 | q->stat_throttled++; |
235 | |
236 | f->next = &throttled; |
237 | if (q->time_next_delayed_flow > f->time_next_packet) |
238 | q->time_next_delayed_flow = f->time_next_packet; |
239 | } |

static struct kmem_cache *fq_flow_cachep __read_mostly;

/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}

static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct rb_node **p, *parent;
	void *tofree[FQ_GC_MAX];
	struct fq_flow *f;
	int i, fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	if (!fcnt)
		return;

	for (i = fcnt; i > 0; ) {
		f = tofree[--i];
		rb_erase(&f->fq_node, root);
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;

	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}

/* Fast path can be used if :
 * 1) Packet tstamp is in the past.
 * 2) FQ qlen == 0 OR
 *    (no flow is currently eligible for transmit,
 *     AND fast path queue has less than 8 packets)
 * 3) No SO_MAX_PACING_RATE on the socket (if any).
 * 4) No @maxrate attribute on this qdisc.
 *
 * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
 */
static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
			      u64 now)
{
	const struct fq_sched_data *q = qdisc_priv(sch);
	const struct sock *sk;

	if (fq_skb_cb(skb)->time_to_send > now)
		return false;

	if (sch->q.qlen != 0) {
		/* Even if some packets are stored in this qdisc,
		 * we can still enable fast path if all of them are
		 * scheduled in the future (ie no flows are eligible)
		 * or in the fast path queue.
		 */
		if (q->flows != q->inactive_flows + q->throttled_flows)
			return false;

		/* Do not allow fast path queue to explode, we want Fair Queue mode
		 * under pressure.
		 */
		if (q->internal.qlen >= 8)
			return false;
	}

	sk = skb->sk;
	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
	    sk->sk_max_pacing_rate != ~0UL)
		return false;

	if (q->flow_max_rate != ~0UL)
		return false;

	return true;
}

static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
				   u64 now)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are non connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	if (fq_fastpath_check(sch, skb, now)) {
		q->internal.stat_fastpath_packets++;
		if (skb->sk == sk && q->rate_enable &&
		    READ_ONCE(sk->sk_pacing_status) != SK_PACING_FQ)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
		return &q->internal;
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}

static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}

static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}

/* Remove one skb from flow queue.
 * This skb must be the return value of prior fq_peek().
 */
static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
			   struct sk_buff *skb)
{
	fq_erase_head(sch, flow, skb);
	skb_mark_not_on_list(skb);
	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
}

static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}

static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
				     const struct fq_sched_data *q, u64 now)
{
	return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;
	u64 now;
	u8 band;

	band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
	if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
		q->stat_band_drops[band]++;
		return qdisc_drop(skb, sch, to_free);
	}

	now = ktime_get_ns();
	if (!skb->tstamp) {
		fq_skb_cb(skb)->time_to_send = now;
	} else {
		/* Check if packet timestamp is too far in the future. */
		if (fq_packet_beyond_horizon(skb, q, now)) {
			if (q->horizon_drop) {
				q->stat_horizon_drops++;
				return qdisc_drop(skb, sch, to_free);
			}
			q->stat_horizon_caps++;
			skb->tstamp = now + q->horizon;
		}
		fq_skb_cb(skb)->time_to_send = skb->tstamp;
	}

	f = fq_classify(sch, skb, now);

	if (f != &q->internal) {
		if (unlikely(f->qlen >= q->flow_plimit)) {
			q->stat_flows_plimit++;
			return qdisc_drop(skb, sch, to_free);
		}

		if (fq_flow_is_detached(f)) {
			fq_flow_add_tail(q, f, NEW_FLOW);
			if (time_after(jiffies, f->age + q->flow_refill_delay))
				f->credit = max_t(u32, f->credit, q->quantum);
		}

		f->band = band;
		q->band_pkt_count[band]++;
		fq_skb_cb(skb)->band = band;
		if (f->qlen == 0)
			q->inactive_flows--;
	}

	f->qlen++;
	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
	q->unthrottle_latency_ns += sample >> 3;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}

static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
{
	if (pband->credit <= 0)
		return NULL;

	if (pband->new_flows.first)
		return &pband->new_flows;

	return pband->old_flows.first ? &pband->old_flows : NULL;
}
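
/* Note (informal) : like fq_codel, new flows are serviced before old ones
 * within a band, which gives short/interactive flows a brief priority boost
 * before they fall back into the old flows round robin.
 */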

static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_perband_flows *pband;
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	int retry;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_peek(&q->internal);
	if (unlikely(skb)) {
		q->internal.qlen--;
		fq_dequeue_skb(sch, &q->internal, skb);
		goto out;
	}

	now = ktime_get_ns();
	fq_check_throttled(q, now);
	retry = 0;
	pband = &q->band_flows[q->band_nr];
begin:
	head = fq_pband_head_select(pband);
	if (!head) {
		while (++retry <= FQ_BANDS) {
			if (++q->band_nr == FQ_BANDS)
				q->band_nr = 0;
			pband = &q->band_flows[q->band_nr];
			pband->credit = min(pband->credit + pband->quantum,
					    pband->quantum);
			goto begin;
		}
		if (q->time_next_delayed_flow != ~0ULL)
			qdisc_watchdog_schedule_range_ns(&q->watchdog,
						q->time_next_delayed_flow,
						q->timer_slack);
		return NULL;
	}
	f = head->first;
	retry = 0;
	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(q, f, OLD_FLOW);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		prefetch(&skb->end);
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
		if (--f->qlen == 0)
			q->inactive_flows++;
		q->band_pkt_count[fq_skb_cb(skb)->band]--;
		fq_dequeue_skb(sch, f, skb);
	} else {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if (head == &pband->new_flows) {
			fq_flow_add_tail(q, f, OLD_FLOW);
		} else {
			fq_flow_set_detached(f);
		}
		goto begin;
	}
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;
	pband->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
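		/* Worked example (illustrative) : a 1500 byte packet at
		 * rate = 12500000 bytes/sec (100 Mbit/s) gives
		 * len = 1500 * NSEC_PER_SEC / 12500000 = 120000 ns,
		 * ie the flow sends at most one such packet every 120 usec.
		 */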
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}

static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}

static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	for (idx = 0; idx < FQ_BANDS; idx++) {
		q->band_flows[idx].new_flows.first = NULL;
		q->band_flows[idx].old_flows.first = NULL;
	}
	q->delayed = RB_ROOT;
	q->flows = 0;
	q->inactive_flows = 0;
	q->throttled_flows = 0;
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static void fq_free(void *addr)
{
	kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was setup, we can allocate memory on right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	q->fq_trees_log = log;

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}

static const struct netlink_range_validation iq_range = {
	.max = INT_MAX,
};

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
	[TCA_FQ_PRIOMAP]		= {
			.type = NLA_BINARY,
			.len = sizeof(struct tc_prio_qopt),
		},
	[TCA_FQ_WEIGHTS]		= {
			.type = NLA_BINARY,
			.len = FQ_BANDS * sizeof(s32),
		},
};

/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	int i;

	memset(out, 0, num_elems / 4);
	for (i = 0; i < num_elems; i++)
		out[i / 4] |= in[i] << (2 * (i & 0x3));
}
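
/* Worked example (illustrative) : with the default pfifo_fast style priomap
 * {1, 2, 2, 2, 1, 2, 0, 0, ...}, the first output byte packs four crumbs :
 * out[0] = 1 | (2 << 2) | (2 << 4) | (2 << 6) = 0xA9.
 */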

static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
{
	const int num_elems = TC_PRIO_MAX + 1;
	int i;

	for (i = 0; i < num_elems; i++)
		out[i] = fq_prio2band(in, i);
}

static int fq_load_weights(struct fq_sched_data *q,
			   const struct nlattr *attr,
			   struct netlink_ext_ack *extack)
{
	s32 *weights = nla_data(attr);
	int i;

	for (i = 0; i < FQ_BANDS; i++) {
		if (weights[i] < FQ_MIN_WEIGHT) {
			NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less than minimum allowed %d",
					       weights[i], FQ_MIN_WEIGHT);
			return -EINVAL;
		}
	}
	for (i = 0; i < FQ_BANDS; i++)
		q->band_flows[i].quantum = weights[i];
	return 0;
}

static int fq_load_priomap(struct fq_sched_data *q,
			   const struct nlattr *attr,
			   struct netlink_ext_ack *extack)
{
	const struct tc_prio_qopt *map = nla_data(attr);
	int i;

	if (map->bands != FQ_BANDS) {
		NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
		return -EINVAL;
	}
	for (i = 0; i < TC_PRIO_MAX + 1; i++) {
		if (map->priomap[i] >= FQ_BANDS) {
			NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
					       i, map->priomap[i]);
			return -EINVAL;
		}
	}
	fq_prio2band_compress_crumb(map->priomap, q->prio2band);
	return 0;
}

static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned drop_len = 0;
	u32 fq_log;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			q->quantum = quantum;
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		q->low_rate_threshold =
			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
	}

	if (!err && tb[TCA_FQ_PRIOMAP])
		err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);

	if (!err && tb[TCA_FQ_WEIGHTS])
		err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);

	if (tb[TCA_FQ_ORPHAN_MASK])
		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

	if (tb[TCA_FQ_CE_THRESHOLD])
		q->ce_threshold = (u64)NSEC_PER_USEC *
				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);

	if (tb[TCA_FQ_TIMER_SLACK])
		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);

	if (tb[TCA_FQ_HORIZON])
		q->horizon = (u64)NSEC_PER_USEC *
			     nla_get_u32(tb[TCA_FQ_HORIZON]);

	if (tb[TCA_FQ_HORIZON_DROP])
		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);

	if (!err) {

		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int i, err;

	sch->limit = 10000;
	q->flow_plimit = 100;
	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay = msecs_to_jiffies(40);
	q->flow_max_rate = ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable = 1;
	for (i = 0; i < FQ_BANDS; i++) {
		q->band_flows[i].new_flows.first = NULL;
		q->band_flows[i].old_flows.first = NULL;
	}
	q->band_flows[0].quantum = 9 << 16;
	q->band_flows[1].quantum = 3 << 16;
	q->band_flows[2].quantum = 1 << 16;
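	/* Informal note : 9 << 16, 3 << 16 and 1 << 16 are 589824, 196608 and
	 * 65536 bytes (576KB, 192KB and 64KB), giving the three bands a 9:3:1
	 * weighted share of the link when all bands are backlogged.
	 */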
	q->delayed = RB_ROOT;
	q->fq_root = NULL;
	q->fq_trees_log = ilog2(1024);
	q->orphan_mask = 1024 - 1;
	q->low_rate_threshold = 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
	q->horizon_drop = 1; /* by default, drop packets beyond horizon */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;

	fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 ce_threshold = q->ce_threshold;
	struct tc_prio_qopt prio = {
		.bands = FQ_BANDS,
	};
	u64 horizon = q->horizon;
	struct nlattr *opts;
	s32 weights[3];

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	do_div(ce_threshold, NSEC_PER_USEC);
	do_div(horizon, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			q->low_rate_threshold) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
		goto nla_put_failure;

	fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
	if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
		goto nla_put_failure;

	weights[0] = q->band_flows[0].quantum;
	weights[1] = q->band_flows[1].quantum;
	weights[2] = q->band_flows[2].quantum;
	if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;
	int i;

	st.pad = 0;

	sch_tree_lock(sch);

	st.gc_flows = q->stat_gc_flows;
	st.highprio_packets = 0;
	st.fastpath_packets = q->internal.stat_fastpath_packets;
	st.tcp_retrans = 0;
	st.throttled = q->stat_throttled;
	st.flows_plimit = q->stat_flows_plimit;
	st.pkts_too_long = q->stat_pkts_too_long;
	st.allocation_errors = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows = q->flows;
	st.inactive_flows = q->inactive_flows;
	st.throttled_flows = q->throttled_flows;
	st.unthrottle_latency_ns = min_t(unsigned long,
					 q->unthrottle_latency_ns, ~0U);
	st.ce_mark = q->stat_ce_mark;
	st.horizon_drops = q->stat_horizon_drops;
	st.horizon_caps = q->stat_horizon_caps;
	for (i = 0; i < FQ_BANDS; i++) {
		st.band_drops[i] = q->stat_band_drops[i];
		st.band_pkt_count[i] = q->band_pkt_count[i];
	}
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		= "fq",
	.priv_size	= sizeof(struct fq_sched_data),

	.enqueue	= fq_enqueue,
	.dequeue	= fq_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= fq_init,
	.reset		= fq_reset,
	.destroy	= fq_destroy,
	.change		= fq_change,
	.dump		= fq_dump,
	.dump_stats	= fq_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, SLAB_HWCACHE_ALIGN, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Queue Packet Scheduler");
1301 | |