// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *		- Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
#include <trace/events/net.h>
#include <net/xfrm.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

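/* Clear STATE_MISSED and, when the txq is still active, set it again so
 * that a reschedule signalled by netif_tx_wake_queue() is not lost; when
 * the txq is frozen or stopped, set STATE_DRAINING instead so dequeuing
 * pauses until the queue is woken up.
 */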
static void qdisc_maybe_clear_missed(struct Qdisc *q,
				     const struct netdev_queue *txq)
{
	clear_bit(__QDISC_STATE_MISSED, &q->state);

	/* Make sure the below netif_xmit_frozen_or_stopped()
	 * checking happens after clearing STATE_MISSED.
	 */
	smp_mb__after_atomic();

	/* Check netif_xmit_frozen_or_stopped() again to make sure
	 * STATE_MISSED is set again if a STATE_MISSED set by
	 * netif_tx_wake_queue()'s rescheduling of net_tx_action()
	 * was cleared by the clear_bit() above.
	 */
	if (!netif_xmit_frozen_or_stopped(txq))
		set_bit(__QDISC_STATE_MISSED, &q->state);
	else
		set_bit(__QDISC_STATE_DRAINING, &q->state);
}

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

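/* Sentinel returned by __skb_dequeue_bad_txq() when the txq is frozen or
 * stopped: (struct sk_buff *)1UL can never alias a real skb, so callers
 * can tell "back off" apart from "queue empty" (NULL).
 */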
#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)

static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason for requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = SKB_XOFF_MAGIC;
			qdisc_maybe_clear_missed(q, txq);
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		/* it's still part of the queue */
		if (qdisc_is_percpu_stats(q)) {
			qdisc_qstats_cpu_requeues_inc(q);
			qdisc_qstats_cpu_backlog_inc(q, skb);
			qdisc_qstats_cpu_qlen_inc(q);
		} else {
			q->qstats.requeues++;
			qdisc_qstats_backlog_inc(q, skb);
			q->q.qlen++;
		}

		skb = next;
	}

	if (lock) {
		spin_unlock(lock);
		set_bit(__QDISC_STATE_MISSED, &q->state);
	} else {
		__netif_schedule(q);
	}
}

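/* Try to dequeue more skbs and chain them to @skb, staying within the
 * BQL byte budget reported by qdisc_avail_bulklimit(). Used only when
 * qdisc_may_bulk() is true, i.e. all skbs map to a single txq.
 */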
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb_mark_not_on_list(skb);
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb_mark_not_on_list(skb);
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be NULL if another CPU pulls gso_skb off in between
		 * the empty check and the lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skbs in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason for requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = NULL;
			qdisc_maybe_clear_missed(q, txq);
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq)) {
		qdisc_maybe_clear_missed(q, txq);
		return skb;
	}

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb)) {
		if (skb == SKB_XOFF_MAGIC)
			return NULL;
		goto bulk;
	}
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning the qdisc running bit guarantees that only one CPU
 * can execute this function.
 *
 * Returns to the caller:
 *	false - hardware queue frozen, back off
 *	true  - feel free to send more pkts
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
		else
			qdisc_maybe_clear_missed(q, txq);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * The running seqcount guarantees that only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 * netif_tx_lock serializes accesses to the device driver.
 *
 * qdisc_lock(q) and netif_tx_lock are mutually exclusive:
 * if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *	false - queue is empty or throttled.
 *	true  - queue is not empty.
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

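/* Dequeue and transmit packets until the qdisc is empty, the driver asks
 * us to back off, or the dev_tx_weight quota is spent; in the quota case,
 * arrange for the qdisc to be rerun so other softirq work gets a turn.
 */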
void __qdisc_run(struct Qdisc *q)
{
	int quota = READ_ONCE(dev_tx_weight);
	int packets;

	while (qdisc_restart(q, &packets)) {
		quota -= packets;
		if (quota <= 0) {
			if (q->flags & TCQ_F_NOLOCK)
				set_bit(__QDISC_STATE_MISSED, &q->state);
			else
				__netif_schedule(q);

			break;
		}
	}
}

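/* Return the most recent trans_start timestamp across all TX queues of
 * @dev; the watchdog below reads per-queue timestamps directly, while
 * this helper gives callers the device-wide view.
 */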
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start);
	unsigned long val;
	unsigned int i;

	for (i = 1; i < dev->num_tx_queues; i++) {
		val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start);
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void netif_freeze_queues(struct net_device *dev)
{
	unsigned int i;
	int cpu;

	cpu = smp_processor_id();
	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		/* We are the only thread of execution doing a
		 * freeze, but we have to grab the _xmit_lock in
		 * order to synchronize with threads which are in
		 * the ->hard_start_xmit() handler and already
		 * checked the frozen bit.
		 */
		__netif_tx_lock(txq, cpu);
		set_bit(__QUEUE_STATE_FROZEN, &txq->state);
		__netif_tx_unlock(txq);
	}
}

void netif_tx_lock(struct net_device *dev)
{
	spin_lock(&dev->tx_global_lock);
	netif_freeze_queues(dev);
}
EXPORT_SYMBOL(netif_tx_lock);

static void netif_unfreeze_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		/* No need to grab the _xmit_lock here. If the
		 * queue is not stopped for another reason, we
		 * force a schedule.
		 */
		clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
		netif_schedule_queue(txq);
	}
}

void netif_tx_unlock(struct net_device *dev)
{
	netif_unfreeze_queues(dev);
	spin_unlock(&dev->tx_global_lock);
}
EXPORT_SYMBOL(netif_tx_unlock);

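/* Per-device watchdog: if any stopped TX queue has seen no transmission
 * for longer than watchdog_timeo, freeze the queues and report a transmit
 * timeout to the driver via ndo_tx_timeout().
 */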
static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);
	bool release = true;

	spin_lock(&dev->tx_global_lock);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			unsigned int timedout_ms = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = READ_ONCE(txq->trans_start);
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					timedout_ms = jiffies_to_msecs(jiffies - trans_start);
					atomic_long_inc(&txq->trans_timeout);
					break;
				}
			}

			if (unlikely(timedout_ms)) {
				trace_net_dev_xmit_timeout(dev, i);
				WARN_ONCE(1, "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out %u ms\n",
					  dev->name, netdev_drivername(dev), i, timedout_ms);
				netif_freeze_queues(dev);
				dev->netdev_ops->ndo_tx_timeout(dev, i);
				netif_unfreeze_queues(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				release = false;
		}
	}
	spin_unlock(&dev->tx_global_lock);

	if (release)
		netdev_put(dev, &dev->watchdog_dev_tracker);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			netdev_hold(dev, &dev->watchdog_dev_tracker,
				    GFP_ATOMIC);
	}
}
EXPORT_SYMBOL_GPL(__netdev_watchdog_up);

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		netdev_put(dev, &dev->watchdog_dev_tracker);
	netif_tx_unlock_bh(dev);
}

/**
 * netif_carrier_on - set carrier
 * @dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 * netif_carrier_off - clear carrier
 * @dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/**
 * netif_carrier_event - report carrier state event
 * @dev: network device
 *
 * Device has detected a carrier event but the carrier state wasn't changed.
 * Use in drivers when querying carrier state asynchronously, to avoid missing
 * events (link flaps) if the link recovers before it's queried.
 */
void netif_carrier_event(struct net_device *dev)
{
	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;
	atomic_inc(&dev->carrier_up_count);
	atomic_inc(&dev->carrier_down_count);
	linkwatch_fire_event(dev);
}
EXPORT_SYMBOL_GPL(netif_carrier_event);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
	RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc),
};

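/* noop_qdisc drops every packet. Its gso_skb and skb_bad_txq lists are
 * statically initialized to empty (next == prev == the list head),
 * mirroring what __skb_queue_head_init() would do at run time.
 */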
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
	.gso_skb = {
		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
	},
	.skb_bad_txq = {
		.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
	},
};
EXPORT_SYMBOL(noop_qdisc);

static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here.
	 */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

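/* Map TC_PRIO_* values (skb->priority & TC_PRIO_MAX) to one of the three
 * pfifo_fast bands; band 0 is dequeued first (interactive), then band 1
 * (best effort), then band 2 (bulk).
 */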
const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
EXPORT_SYMBOL(sch_default_prio2band);

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 * - rings for priority bands
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
{
	return &priv->q[band];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err)) {
		if (qdisc_is_percpu_stats(qdisc))
			return qdisc_drop_cpu(skb, qdisc, to_free);
		else
			return qdisc_drop(skb, qdisc, to_free);
	}

	qdisc_update_stats_at_enqueue(qdisc, pkt_len);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	bool need_retry = true;
	int band;

retry:
	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		if (__skb_array_empty(q))
			continue;

		skb = __skb_array_consume(q);
	}
	if (likely(skb)) {
		qdisc_update_stats_at_dequeue(qdisc, skb);
	} else if (need_retry &&
		   READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) {
		/* Delay clearing the STATE_MISSED here to reduce
		 * the overhead of the second spin_trylock() in
		 * qdisc_run_begin() and of the __netif_schedule()
		 * call in qdisc_run_end().
		 */
		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);

		/* Make sure dequeuing happens after clearing
		 * STATE_MISSED.
		 */
		smp_mb__after_atomic();

		need_retry = false;

		goto retry;
	}

	return skb;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		skb = __skb_array_peek(q);
	}

	return skb;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;

		/* A NULL ring is possible if the destroy path runs after
		 * a failed skb_array_init() in pfifo_fast_init().
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = __skb_array_consume(q)) != NULL)
			kfree_skb(skb);
	}

	if (qdisc_is_percpu_stats(qdisc)) {
		for_each_possible_cpu(i) {
			struct gnet_stats_queue *q;

			q = per_cpu_ptr(qdisc->cpu_qstats, i);
			q->backlog = 0;
			q->qlen = 0;
		}
	}
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}

	/* Can bypass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* A NULL ring is possible if the destroy path runs after
		 * a failed skb_array_init() in pfifo_fast_init().
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy the ring but no need to kfree_skb because a
		 * call to pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
					 GFP_KERNEL);
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =	pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);

static struct lock_class_key qdisc_tx_busylock;

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;
	unsigned int size = sizeof(*sch) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		NL_SET_ERR_MSG(extack, "No device queue given");
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));

	if (!sch)
		goto errout;
	__skb_queue_head_init(&sch->gso_skb);
	__skb_queue_head_init(&sch->skb_bad_txq);
	gnet_stats_basic_sync_init(&sch->bstats);
	spin_lock_init(&sch->q.lock);

	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	/* seqlock has the same scope as busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
	lockdep_set_class(&sch->seqlock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(sch);
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0) {
		trace_qdisc_create(ops, dev_queue->dev, parentid);
		return sch;
	}

	qdisc_put(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	trace_qdisc_reset(qdisc);

	if (ops->reset)
		ops->reset(qdisc);

	__skb_queue_purge(&qdisc->gso_skb);
	__skb_queue_purge(&qdisc->skb_bad_txq);

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree(qdisc);
}

static void qdisc_free_cb(struct rcu_head *head)
{
	struct Qdisc *q = container_of(head, struct Qdisc, rcu);

	qdisc_free(q);
}

static void __qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);

	qdisc_reset(qdisc);

	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	netdev_put(qdisc_dev(qdisc), &qdisc->dev_tracker);

	trace_qdisc_destroy(qdisc);

	call_rcu(&qdisc->rcu, qdisc_free_cb);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return;

	__qdisc_destroy(qdisc);
}

void qdisc_put(struct Qdisc *qdisc)
{
	if (!qdisc)
		return;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	__qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);

/* Version of qdisc_put() that is called with the rtnl mutex unlocked.
 * Intended as an optimization: this function takes the rtnl lock only
 * if the qdisc reference counter reaches zero.
 */

void qdisc_put_unlocked(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	__qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default);

		qdisc_put(qdisc);
	}
}

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;
	else if (dev->type == ARPHRD_CAN)
		ops = &pfifo_fast_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc)
		return;

	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		qdisc = rtnl_dereference(txq->qdisc_sleeping);
		rcu_assign_pointer(dev->qdisc, qdisc);
		qdisc_refcount_inc(qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			rcu_assign_pointer(dev->qdisc, qdisc);
			qdisc->ops->attach(qdisc);
		}
	}
	qdisc = rtnl_dereference(dev->qdisc);

	/* Detect a failed default qdisc setup/init and fall back to "noqueue" */
	if (qdisc == &noop_qdisc) {
		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
		netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
		dev->priv_flags |= IFF_NO_QUEUE;
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		qdisc = rtnl_dereference(txq->qdisc_sleeping);
		rcu_assign_pointer(dev->qdisc, qdisc);
		qdisc_refcount_inc(qdisc);
		dev->priv_flags ^= IFF_NO_QUEUE;
	}

#ifdef CONFIG_NET_SCHED
	if (qdisc != &noop_qdisc)
		qdisc_hash_add(qdisc, false);
#endif
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		WRITE_ONCE(dev_queue->trans_start, 0);
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create a default one for devices which need queueing,
	 * and noqueue_qdisc for virtual interfaces.
	 */

	if (rtnl_dereference(dev->qdisc) == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

static void qdisc_deactivate(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return;

	set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
}

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		qdisc_deactivate(qdisc);
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
	}
}

static void dev_reset_queue(struct net_device *dev,
			    struct netdev_queue *dev_queue,
			    void *_unused)
{
	struct Qdisc *qdisc;
	bool nolock;

	qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
	if (!qdisc)
		return;

	nolock = qdisc->flags & TCQ_F_NOLOCK;

	if (nolock)
		spin_lock_bh(&qdisc->seqlock);
	spin_lock_bh(qdisc_lock(qdisc));

	qdisc_reset(qdisc);

	spin_unlock_bh(qdisc_lock(qdisc));
	if (nolock) {
		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
		spin_unlock_bh(&qdisc->seqlock);
	}
}

static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = rtnl_dereference(dev_queue->qdisc_sleeping);

		root_lock = qdisc_lock(q);
		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

/**
 * dev_deactivate_many - deactivate transmissions on several devices
 * @head: list of devices to deactivate
 *
 * This function returns only when all outstanding transmissions
 * have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls or
	 * outstanding qdisc enqueuing calls.
	 * This is avoided if all devices are in dismantle phase:
	 * the caller will call synchronize_net() for us.
	 */
	synchronize_net();

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);

		if (dev_ingress_queue(dev))
			dev_reset_queue(dev, dev_ingress_queue(dev), NULL);
	}

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list) {
		while (some_qdisc_is_busy(dev)) {
			/* wait_event() would avoid this sleep-loop but would
			 * require expensive checks in the fast paths of packet
			 * processing which isn't worth it.
			 */
			schedule_timeout_uninterruptible(1);
		}
	}
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static int qdisc_change_tx_queue_len(struct net_device *dev,
				     struct netdev_queue *dev_queue)
{
	struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->change_tx_queue_len)
		return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
	return 0;
}

void dev_qdisc_change_real_num_tx(struct net_device *dev,
				  unsigned int new_real_tx)
{
	struct Qdisc *qdisc = rtnl_dereference(dev->qdisc);

	if (qdisc->ops->change_real_num_tx)
		qdisc->ops->change_real_num_tx(qdisc, new_real_tx);
}

void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx)
{
#ifdef CONFIG_NET_SCHED
	struct net_device *dev = qdisc_dev(sch);
	struct Qdisc *qdisc;
	unsigned int i;

	for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
		/* Only update the default qdiscs we created;
		 * qdiscs with handles are always hashed.
		 */
		if (qdisc != &noop_qdisc && !qdisc->handle)
			qdisc_hash_del(qdisc);
	}
	for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
		if (qdisc != &noop_qdisc && !qdisc->handle)
			qdisc_hash_add(qdisc, false);
	}
#endif
}
EXPORT_SYMBOL(mq_change_real_num_tx);

int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
	bool up = dev->flags & IFF_UP;
	unsigned int i;
	int ret = 0;

	if (up)
		dev_deactivate(dev);

	for (i = 0; i < dev->num_tx_queues; i++) {
		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);

		/* TODO: revert changes on a partial failure */
		if (ret)
			break;
	}

	if (up)
		dev_activate(dev);
	return ret;
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
}

void dev_init_scheduler(struct net_device *dev)
{
	rcu_assign_pointer(dev->qdisc, &noop_qdisc);
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_put(rtnl_dereference(dev->qdisc));
	rcu_assign_pointer(dev->qdisc, &noop_qdisc);

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

/**
 * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division
 * @rate: Rate to compute reciprocal division values of
 * @mult: Multiplier for reciprocal division
 * @shift: Shift for reciprocal division
 *
 * The multiplier and shift for reciprocal division by rate are stored
 * in mult and shift.
 *
 * The deal here is to replace a divide by a reciprocal one
 * in the fast path (a reciprocal divide is a multiply and a shift).
 *
 * The normal formula would be:
 *   time_in_ns = (NSEC_PER_SEC * len) / rate_bps
 *
 * We compute mult/shift to use instead:
 *   time_in_ns = (len * mult) >> shift;
 *
 * We try to get the highest possible mult value for accuracy,
 * but have to make sure no overflows will ever happen.
 *
 * reciprocal_value() is not used here because it doesn't handle 64-bit values.
 */
static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift)
{
	u64 factor = NSEC_PER_SEC;

	*mult = 1;
	*shift = 0;

	if (rate <= 0)
		return;

	for (;;) {
		*mult = div64_u64(factor, rate);
		if (*mult & (1U << 31) || factor & (1ULL << 63))
			break;
		factor <<= 1;
		(*shift)++;
	}
}
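
/* Worked example (illustrative, following the loop above): for a rate of
 * 125,000,000 bytes/s (1 Gbit/s), the loop settles on mult = 0x80000000
 * and shift = 28, so (len * mult) >> shift == len * 8, i.e. 8 ns per
 * byte, which matches NSEC_PER_SEC / 125000000 exactly.
 */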

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->mpu = conf->mpu;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift);
}
EXPORT_SYMBOL(psched_ratecfg_precompute);

void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
{
	r->rate_pkts_ps = pktrate64;
	psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift);
}
EXPORT_SYMBOL(psched_ppscfg_precompute);

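/* The mini_Qdisc pair below double-buffers the active filter list:
 * updates go into whichever of miniq1/miniq2 is currently inactive, the
 * new buffer is published with rcu_assign_pointer(), and the old one must
 * pass through an RCU grace period before it can be reused.
 */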
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	/* Protected with chain0->filter_chain_lock.
	 * Can't access chain directly because tp_head can be NULL.
	 */
	struct mini_Qdisc *miniq_old =
		rcu_dereference_protected(*miniqp->p_miniq, 1);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
	} else {
		miniq = miniq_old != &miniqp->miniq1 ?
			&miniqp->miniq1 : &miniqp->miniq2;

		/* We need to make sure that readers won't see the miniq
		 * we are about to modify. So ensure that at least one RCU
		 * grace period has elapsed since the miniq was made
		 * inactive.
		 */
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			cond_synchronize_rcu(miniq->rcu_state);
		else if (!poll_state_synchronize_rcu(miniq->rcu_state))
			synchronize_rcu_expedited();

		miniq->filter_list = tp_head;
		rcu_assign_pointer(*miniqp->p_miniq, miniq);
	}

	if (miniq_old)
		/* This is the counterpart of the RCU sync above. We need
		 * to block any potential new user of miniq_old until no
		 * readers are still seeing it.
		 */
		miniq_old->rcu_state = start_poll_synchronize_rcu();
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
				struct tcf_block *block)
{
	miniqp->miniq1.block = block;
	miniqp->miniq2.block = block;
}
EXPORT_SYMBOL(mini_qdisc_pair_block_init);

void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq1.rcu_state = get_state_synchronize_rcu();
	miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);