1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * net/sched/sch_cbs.c Credit Based Shaper |
4 | * |
5 | * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> |
6 | */ |
7 | |
8 | /* Credit Based Shaper (CBS) |
9 | * ========================= |
10 | * |
11 | * This is a simple rate-limiting shaper aimed at TSN applications on |
12 | * systems with known traffic workloads. |
13 | * |
14 | * Its algorithm is defined by the IEEE 802.1Q-2014 Specification, |
15 | * Section 8.6.8.2, and explained in more detail in the Annex L of the |
16 | * same specification. |
17 | * |
18 | * There are four tunables to be considered: |
19 | * |
20 | * 'idleslope': Idleslope is the rate of credits that is |
21 | * accumulated (in kilobits per second) when there is at least |
22 | * one packet waiting for transmission. Packets are transmitted |
23 | * when the current value of credits is equal or greater than |
24 | * zero. When there is no packet to be transmitted the amount of |
25 | * credits is set to zero. This is the main tunable of the CBS |
26 | * algorithm. |
27 | * |
28 | * 'sendslope': |
29 | * Sendslope is the rate of credits that is depleted (it should be a |
30 | * negative number of kilobits per second) when a transmission is |
31 | * occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section |
32 | * 8.6.8.2 item g): |
33 | * |
34 | * sendslope = idleslope - port_transmit_rate |
35 | * |
36 | * 'hicredit': Hicredit defines the maximum amount of credits (in |
37 | * bytes) that can be accumulated. Hicredit depends on the |
38 | * characteristics of interfering traffic, |
39 | * 'max_interference_size' is the maximum size of any burst of |
40 | * traffic that can delay the transmission of a frame that is |
41 | * available for transmission for this traffic class, (IEEE |
42 | * 802.1Q-2014 Annex L, Equation L-3): |
43 | * |
44 | * hicredit = max_interference_size * (idleslope / port_transmit_rate) |
45 | * |
46 | * 'locredit': Locredit is the minimum amount of credits that can |
47 | * be reached. It is a function of the traffic flowing through |
48 | * this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2): |
49 | * |
50 | * locredit = max_frame_size * (sendslope / port_transmit_rate) |
51 | */ |
52 | |
53 | #include <linux/ethtool.h> |
54 | #include <linux/module.h> |
55 | #include <linux/types.h> |
56 | #include <linux/kernel.h> |
57 | #include <linux/string.h> |
58 | #include <linux/errno.h> |
59 | #include <linux/skbuff.h> |
60 | #include <net/netevent.h> |
61 | #include <net/netlink.h> |
62 | #include <net/sch_generic.h> |
63 | #include <net/pkt_sched.h> |
64 | |
/* Global registry of CBS instances. cbs_init() links each instance in,
 * cbs_destroy() unlinks it, and the netdevice notifier walks it to find
 * the instance attached to a given device. cbs_list_lock guards the list.
 */
static LIST_HEAD(cbs_list);
static DEFINE_SPINLOCK(cbs_list_lock);
67 | |
/* Number of bytes in one kilobit (1000 bits / 8). The long long constant
 * makes every product that uses it a 64-bit computation.
 */
#define BYTES_PER_KBIT (1000LL / 8)
69 | |
70 | struct cbs_sched_data { |
71 | bool offload; |
72 | int queue; |
73 | atomic64_t port_rate; /* in bytes/s */ |
74 | s64 last; /* timestamp in ns */ |
75 | s64 credits; /* in bytes */ |
76 | s32 locredit; /* in bytes */ |
77 | s32 hicredit; /* in bytes */ |
78 | s64 sendslope; /* in bytes/s */ |
79 | s64 idleslope; /* in bytes/s */ |
80 | struct qdisc_watchdog watchdog; |
81 | int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, |
82 | struct sk_buff **to_free); |
83 | struct sk_buff *(*dequeue)(struct Qdisc *sch); |
84 | struct Qdisc *qdisc; |
85 | struct list_head cbs_list; |
86 | }; |
87 | |
88 | static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
89 | struct Qdisc *child, |
90 | struct sk_buff **to_free) |
91 | { |
92 | unsigned int len = qdisc_pkt_len(skb); |
93 | int err; |
94 | |
95 | err = child->ops->enqueue(skb, child, to_free); |
96 | if (err != NET_XMIT_SUCCESS) |
97 | return err; |
98 | |
99 | sch->qstats.backlog += len; |
100 | sch->q.qlen++; |
101 | |
102 | return NET_XMIT_SUCCESS; |
103 | } |
104 | |
105 | static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch, |
106 | struct sk_buff **to_free) |
107 | { |
108 | struct cbs_sched_data *q = qdisc_priv(sch); |
109 | struct Qdisc *qdisc = q->qdisc; |
110 | |
111 | return cbs_child_enqueue(skb, sch, child: qdisc, to_free); |
112 | } |
113 | |
114 | static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch, |
115 | struct sk_buff **to_free) |
116 | { |
117 | struct cbs_sched_data *q = qdisc_priv(sch); |
118 | struct Qdisc *qdisc = q->qdisc; |
119 | |
120 | if (sch->q.qlen == 0 && q->credits > 0) { |
121 | /* We need to stop accumulating credits when there's |
122 | * no enqueued packets and q->credits is positive. |
123 | */ |
124 | q->credits = 0; |
125 | q->last = ktime_get_ns(); |
126 | } |
127 | |
128 | return cbs_child_enqueue(skb, sch, child: qdisc, to_free); |
129 | } |
130 | |
131 | static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, |
132 | struct sk_buff **to_free) |
133 | { |
134 | struct cbs_sched_data *q = qdisc_priv(sch); |
135 | |
136 | return q->enqueue(skb, sch, to_free); |
137 | } |
138 | |
139 | /* timediff is in ns, slope is in bytes/s */ |
140 | static s64 timediff_to_credits(s64 timediff, s64 slope) |
141 | { |
142 | return div64_s64(dividend: timediff * slope, NSEC_PER_SEC); |
143 | } |
144 | |
145 | static s64 delay_from_credits(s64 credits, s64 slope) |
146 | { |
147 | if (unlikely(slope == 0)) |
148 | return S64_MAX; |
149 | |
150 | return div64_s64(dividend: -credits * NSEC_PER_SEC, divisor: slope); |
151 | } |
152 | |
153 | static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) |
154 | { |
155 | if (unlikely(port_rate == 0)) |
156 | return S64_MAX; |
157 | |
158 | return div64_s64(dividend: len * slope, divisor: port_rate); |
159 | } |
160 | |
161 | static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child) |
162 | { |
163 | struct sk_buff *skb; |
164 | |
165 | skb = child->ops->dequeue(child); |
166 | if (!skb) |
167 | return NULL; |
168 | |
169 | qdisc_qstats_backlog_dec(sch, skb); |
170 | qdisc_bstats_update(sch, skb); |
171 | sch->q.qlen--; |
172 | |
173 | return skb; |
174 | } |
175 | |
176 | static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) |
177 | { |
178 | struct cbs_sched_data *q = qdisc_priv(sch); |
179 | struct Qdisc *qdisc = q->qdisc; |
180 | s64 now = ktime_get_ns(); |
181 | struct sk_buff *skb; |
182 | s64 credits; |
183 | int len; |
184 | |
185 | /* The previous packet is still being sent */ |
186 | if (now < q->last) { |
187 | qdisc_watchdog_schedule_ns(wd: &q->watchdog, expires: q->last); |
188 | return NULL; |
189 | } |
190 | if (q->credits < 0) { |
191 | credits = timediff_to_credits(timediff: now - q->last, slope: q->idleslope); |
192 | |
193 | credits = q->credits + credits; |
194 | q->credits = min_t(s64, credits, q->hicredit); |
195 | |
196 | if (q->credits < 0) { |
197 | s64 delay; |
198 | |
199 | delay = delay_from_credits(credits: q->credits, slope: q->idleslope); |
200 | qdisc_watchdog_schedule_ns(wd: &q->watchdog, expires: now + delay); |
201 | |
202 | q->last = now; |
203 | |
204 | return NULL; |
205 | } |
206 | } |
207 | skb = cbs_child_dequeue(sch, child: qdisc); |
208 | if (!skb) |
209 | return NULL; |
210 | |
211 | len = qdisc_pkt_len(skb); |
212 | |
213 | /* As sendslope is a negative number, this will decrease the |
214 | * amount of q->credits. |
215 | */ |
216 | credits = credits_from_len(len, slope: q->sendslope, |
217 | port_rate: atomic64_read(v: &q->port_rate)); |
218 | credits += q->credits; |
219 | |
220 | q->credits = max_t(s64, credits, q->locredit); |
221 | /* Estimate of the transmission of the last byte of the packet in ns */ |
222 | if (unlikely(atomic64_read(&q->port_rate) == 0)) |
223 | q->last = now; |
224 | else |
225 | q->last = now + div64_s64(dividend: len * NSEC_PER_SEC, |
226 | divisor: atomic64_read(v: &q->port_rate)); |
227 | |
228 | return skb; |
229 | } |
230 | |
231 | static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) |
232 | { |
233 | struct cbs_sched_data *q = qdisc_priv(sch); |
234 | struct Qdisc *qdisc = q->qdisc; |
235 | |
236 | return cbs_child_dequeue(sch, child: qdisc); |
237 | } |
238 | |
239 | static struct sk_buff *cbs_dequeue(struct Qdisc *sch) |
240 | { |
241 | struct cbs_sched_data *q = qdisc_priv(sch); |
242 | |
243 | return q->dequeue(sch); |
244 | } |
245 | |
246 | static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { |
247 | [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, |
248 | }; |
249 | |
250 | static void cbs_disable_offload(struct net_device *dev, |
251 | struct cbs_sched_data *q) |
252 | { |
253 | struct tc_cbs_qopt_offload cbs = { }; |
254 | const struct net_device_ops *ops; |
255 | int err; |
256 | |
257 | if (!q->offload) |
258 | return; |
259 | |
260 | q->enqueue = cbs_enqueue_soft; |
261 | q->dequeue = cbs_dequeue_soft; |
262 | |
263 | ops = dev->netdev_ops; |
264 | if (!ops->ndo_setup_tc) |
265 | return; |
266 | |
267 | cbs.queue = q->queue; |
268 | cbs.enable = 0; |
269 | |
270 | err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); |
271 | if (err < 0) |
272 | pr_warn("Couldn't disable CBS offload for queue %d\n" , |
273 | cbs.queue); |
274 | } |
275 | |
276 | static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q, |
277 | const struct tc_cbs_qopt *opt, |
278 | struct netlink_ext_ack *extack) |
279 | { |
280 | const struct net_device_ops *ops = dev->netdev_ops; |
281 | struct tc_cbs_qopt_offload cbs = { }; |
282 | int err; |
283 | |
284 | if (!ops->ndo_setup_tc) { |
285 | NL_SET_ERR_MSG(extack, "Specified device does not support cbs offload" ); |
286 | return -EOPNOTSUPP; |
287 | } |
288 | |
289 | cbs.queue = q->queue; |
290 | |
291 | cbs.enable = 1; |
292 | cbs.hicredit = opt->hicredit; |
293 | cbs.locredit = opt->locredit; |
294 | cbs.idleslope = opt->idleslope; |
295 | cbs.sendslope = opt->sendslope; |
296 | |
297 | err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs); |
298 | if (err < 0) { |
299 | NL_SET_ERR_MSG(extack, "Specified device failed to setup cbs hardware offload" ); |
300 | return err; |
301 | } |
302 | |
303 | q->enqueue = cbs_enqueue_offload; |
304 | q->dequeue = cbs_dequeue_offload; |
305 | |
306 | return 0; |
307 | } |
308 | |
309 | static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) |
310 | { |
311 | struct ethtool_link_ksettings ecmd; |
312 | int speed = SPEED_10; |
313 | int port_rate; |
314 | int err; |
315 | |
316 | err = __ethtool_get_link_ksettings(dev, link_ksettings: &ecmd); |
317 | if (err < 0) |
318 | goto skip; |
319 | |
320 | if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) |
321 | speed = ecmd.base.speed; |
322 | |
323 | skip: |
324 | port_rate = speed * 1000 * BYTES_PER_KBIT; |
325 | |
326 | atomic64_set(v: &q->port_rate, i: port_rate); |
327 | netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n" , |
328 | dev->name, (long long)atomic64_read(&q->port_rate), |
329 | ecmd.base.speed); |
330 | } |
331 | |
332 | static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event, |
333 | void *ptr) |
334 | { |
335 | struct net_device *dev = netdev_notifier_info_to_dev(info: ptr); |
336 | struct cbs_sched_data *q; |
337 | struct net_device *qdev; |
338 | bool found = false; |
339 | |
340 | ASSERT_RTNL(); |
341 | |
342 | if (event != NETDEV_UP && event != NETDEV_CHANGE) |
343 | return NOTIFY_DONE; |
344 | |
345 | spin_lock(lock: &cbs_list_lock); |
346 | list_for_each_entry(q, &cbs_list, cbs_list) { |
347 | qdev = qdisc_dev(qdisc: q->qdisc); |
348 | if (qdev == dev) { |
349 | found = true; |
350 | break; |
351 | } |
352 | } |
353 | spin_unlock(lock: &cbs_list_lock); |
354 | |
355 | if (found) |
356 | cbs_set_port_rate(dev, q); |
357 | |
358 | return NOTIFY_DONE; |
359 | } |
360 | |
361 | static int cbs_change(struct Qdisc *sch, struct nlattr *opt, |
362 | struct netlink_ext_ack *extack) |
363 | { |
364 | struct cbs_sched_data *q = qdisc_priv(sch); |
365 | struct net_device *dev = qdisc_dev(qdisc: sch); |
366 | struct nlattr *tb[TCA_CBS_MAX + 1]; |
367 | struct tc_cbs_qopt *qopt; |
368 | int err; |
369 | |
370 | err = nla_parse_nested_deprecated(tb, TCA_CBS_MAX, nla: opt, policy: cbs_policy, |
371 | extack); |
372 | if (err < 0) |
373 | return err; |
374 | |
375 | if (!tb[TCA_CBS_PARMS]) { |
376 | NL_SET_ERR_MSG(extack, "Missing CBS parameter which are mandatory" ); |
377 | return -EINVAL; |
378 | } |
379 | |
380 | qopt = nla_data(nla: tb[TCA_CBS_PARMS]); |
381 | |
382 | if (!qopt->offload) { |
383 | cbs_set_port_rate(dev, q); |
384 | cbs_disable_offload(dev, q); |
385 | } else { |
386 | err = cbs_enable_offload(dev, q, opt: qopt, extack); |
387 | if (err < 0) |
388 | return err; |
389 | } |
390 | |
391 | /* Everything went OK, save the parameters used. */ |
392 | q->hicredit = qopt->hicredit; |
393 | q->locredit = qopt->locredit; |
394 | q->idleslope = qopt->idleslope * BYTES_PER_KBIT; |
395 | q->sendslope = qopt->sendslope * BYTES_PER_KBIT; |
396 | q->offload = qopt->offload; |
397 | |
398 | return 0; |
399 | } |
400 | |
401 | static int cbs_init(struct Qdisc *sch, struct nlattr *opt, |
402 | struct netlink_ext_ack *extack) |
403 | { |
404 | struct cbs_sched_data *q = qdisc_priv(sch); |
405 | struct net_device *dev = qdisc_dev(qdisc: sch); |
406 | |
407 | if (!opt) { |
408 | NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory" ); |
409 | return -EINVAL; |
410 | } |
411 | |
412 | q->qdisc = qdisc_create_dflt(dev_queue: sch->dev_queue, ops: &pfifo_qdisc_ops, |
413 | parentid: sch->handle, extack); |
414 | if (!q->qdisc) |
415 | return -ENOMEM; |
416 | |
417 | spin_lock(lock: &cbs_list_lock); |
418 | list_add(new: &q->cbs_list, head: &cbs_list); |
419 | spin_unlock(lock: &cbs_list_lock); |
420 | |
421 | qdisc_hash_add(q: q->qdisc, invisible: false); |
422 | |
423 | q->queue = sch->dev_queue - netdev_get_tx_queue(dev, index: 0); |
424 | |
425 | q->enqueue = cbs_enqueue_soft; |
426 | q->dequeue = cbs_dequeue_soft; |
427 | |
428 | qdisc_watchdog_init(wd: &q->watchdog, qdisc: sch); |
429 | |
430 | return cbs_change(sch, opt, extack); |
431 | } |
432 | |
433 | static void cbs_destroy(struct Qdisc *sch) |
434 | { |
435 | struct cbs_sched_data *q = qdisc_priv(sch); |
436 | struct net_device *dev = qdisc_dev(qdisc: sch); |
437 | |
438 | /* Nothing to do if we couldn't create the underlying qdisc */ |
439 | if (!q->qdisc) |
440 | return; |
441 | |
442 | qdisc_watchdog_cancel(wd: &q->watchdog); |
443 | cbs_disable_offload(dev, q); |
444 | |
445 | spin_lock(lock: &cbs_list_lock); |
446 | list_del(entry: &q->cbs_list); |
447 | spin_unlock(lock: &cbs_list_lock); |
448 | |
449 | qdisc_put(qdisc: q->qdisc); |
450 | } |
451 | |
452 | static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) |
453 | { |
454 | struct cbs_sched_data *q = qdisc_priv(sch); |
455 | struct tc_cbs_qopt opt = { }; |
456 | struct nlattr *nest; |
457 | |
458 | nest = nla_nest_start_noflag(skb, attrtype: TCA_OPTIONS); |
459 | if (!nest) |
460 | goto nla_put_failure; |
461 | |
462 | opt.hicredit = q->hicredit; |
463 | opt.locredit = q->locredit; |
464 | opt.sendslope = div64_s64(dividend: q->sendslope, BYTES_PER_KBIT); |
465 | opt.idleslope = div64_s64(dividend: q->idleslope, BYTES_PER_KBIT); |
466 | opt.offload = q->offload; |
467 | |
468 | if (nla_put(skb, attrtype: TCA_CBS_PARMS, attrlen: sizeof(opt), data: &opt)) |
469 | goto nla_put_failure; |
470 | |
471 | return nla_nest_end(skb, start: nest); |
472 | |
473 | nla_put_failure: |
474 | nla_nest_cancel(skb, start: nest); |
475 | return -1; |
476 | } |
477 | |
478 | static int cbs_dump_class(struct Qdisc *sch, unsigned long cl, |
479 | struct sk_buff *skb, struct tcmsg *tcm) |
480 | { |
481 | struct cbs_sched_data *q = qdisc_priv(sch); |
482 | |
483 | if (cl != 1 || !q->qdisc) /* only one class */ |
484 | return -ENOENT; |
485 | |
486 | tcm->tcm_handle |= TC_H_MIN(1); |
487 | tcm->tcm_info = q->qdisc->handle; |
488 | |
489 | return 0; |
490 | } |
491 | |
492 | static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, |
493 | struct Qdisc **old, struct netlink_ext_ack *extack) |
494 | { |
495 | struct cbs_sched_data *q = qdisc_priv(sch); |
496 | |
497 | if (!new) { |
498 | new = qdisc_create_dflt(dev_queue: sch->dev_queue, ops: &pfifo_qdisc_ops, |
499 | parentid: sch->handle, NULL); |
500 | if (!new) |
501 | new = &noop_qdisc; |
502 | } |
503 | |
504 | *old = qdisc_replace(sch, new, pold: &q->qdisc); |
505 | return 0; |
506 | } |
507 | |
508 | static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg) |
509 | { |
510 | struct cbs_sched_data *q = qdisc_priv(sch); |
511 | |
512 | return q->qdisc; |
513 | } |
514 | |
515 | static unsigned long cbs_find(struct Qdisc *sch, u32 classid) |
516 | { |
517 | return 1; |
518 | } |
519 | |
520 | static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker) |
521 | { |
522 | if (!walker->stop) { |
523 | tc_qdisc_stats_dump(sch, cl: 1, arg: walker); |
524 | } |
525 | } |
526 | |
527 | static const struct Qdisc_class_ops cbs_class_ops = { |
528 | .graft = cbs_graft, |
529 | .leaf = cbs_leaf, |
530 | .find = cbs_find, |
531 | .walk = cbs_walk, |
532 | .dump = cbs_dump_class, |
533 | }; |
534 | |
535 | static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { |
536 | .id = "cbs" , |
537 | .cl_ops = &cbs_class_ops, |
538 | .priv_size = sizeof(struct cbs_sched_data), |
539 | .enqueue = cbs_enqueue, |
540 | .dequeue = cbs_dequeue, |
541 | .peek = qdisc_peek_dequeued, |
542 | .init = cbs_init, |
543 | .reset = qdisc_reset_queue, |
544 | .destroy = cbs_destroy, |
545 | .change = cbs_change, |
546 | .dump = cbs_dump, |
547 | .owner = THIS_MODULE, |
548 | }; |
549 | |
550 | static struct notifier_block cbs_device_notifier = { |
551 | .notifier_call = cbs_dev_notifier, |
552 | }; |
553 | |
554 | static int __init cbs_module_init(void) |
555 | { |
556 | int err; |
557 | |
558 | err = register_netdevice_notifier(nb: &cbs_device_notifier); |
559 | if (err) |
560 | return err; |
561 | |
562 | err = register_qdisc(qops: &cbs_qdisc_ops); |
563 | if (err) |
564 | unregister_netdevice_notifier(nb: &cbs_device_notifier); |
565 | |
566 | return err; |
567 | } |
568 | |
569 | static void __exit cbs_module_exit(void) |
570 | { |
571 | unregister_qdisc(qops: &cbs_qdisc_ops); |
572 | unregister_netdevice_notifier(nb: &cbs_device_notifier); |
573 | } |
574 | module_init(cbs_module_init) |
575 | module_exit(cbs_module_exit) |
576 | MODULE_LICENSE("GPL" ); |
577 | |