1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* Copyright (c) 2019 Facebook |
3 | * |
4 | * This program is free software; you can redistribute it and/or |
5 | * modify it under the terms of version 2 of the GNU General Public |
6 | * License as published by the Free Software Foundation. |
7 | * |
 * Example program for Host Bandwidth Management
9 | * |
10 | * This program loads a cgroup skb BPF program to enforce cgroup output |
11 | * (egress) or input (ingress) bandwidth limits. |
12 | * |
13 | * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] |
14 | * Where: |
15 | * -d Print BPF trace debug buffer |
16 | * -l Also limit flows doing loopback |
17 | * -n <#> To create cgroup \"/hbm#\" and attach prog |
18 | * Default is /hbm1 |
19 | * --no_cn Do not return cn notifications |
20 | * -r <rate> Rate limit in Mbps |
21 | * -s Get HBM stats (marked, dropped, etc.) |
 * -t <time> Exit after specified seconds (default is 1)
23 | * -w Work conserving flag. cgroup can increase its bandwidth |
24 | * beyond the rate limit specified while there is available |
 * bandwidth. Current implementation assumes there is only
 * one NIC (eth0), but can be extended to support multiple NICs.
 * Currently only supported for egress.
28 | * -h Print this info |
29 | * prog BPF program file name. Name defaults to hbm_out_kern.o |
30 | */ |
31 | |
32 | #define _GNU_SOURCE |
33 | |
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>
#include <linux/compiler.h>

#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <getopt.h>

#include "cgroup_helpers.h"
#include "hbm.h"
#include "bpf_util.h"
#include <bpf/libbpf.h>
52 | |
/*
 * Run-time configuration, filled in by main() from the command line and
 * consumed by run_bpf_prog().
 */

/*
 * Direction selector. When false, run_bpf_prog() sets the expected attach
 * type to BPF_CGROUP_INET_INGRESS and names the lookup-error stats file
 * "hbm.<id>.in". Not modified anywhere in the visible code.
 */
bool outFlag = true;
int minRate = 1000;		/* cgroup rate limit in Mbps */
int rate = 1000;		/* can grow if rate conserving is enabled */
int dur = 1;			/* run time in seconds (-t) */
bool stats_flag;		/* -s: copied to qstats.stats, enables stats file */
bool loopback_flag;		/* -l: copied to qstats.loopback */
bool debugFlag;			/* -d: dump the trace pipe after the run */
bool work_conserving_flag;	/* -w: adjust rate from eth0 tx counters */
bool no_cn_flag;		/* --no_cn: copied to qstats.no_cn */
bool edt_flag;			/* --edt: selects the "avg credit_ms" stats line */

static void Usage(void);
static void read_trace_pipe2(void);
static void do_error(char *msg, bool errno_flag);

/* Directory prefix used to build the trace_pipe path */
#define TRACEFS "/sys/kernel/tracing/"

/* Set by prog_load(): the selected program, its object, and the map fd */
static struct bpf_program *bpf_prog;
static struct bpf_object *obj;
static int queue_stats_fd;
73 | |
74 | static void read_trace_pipe2(void) |
75 | { |
76 | int trace_fd; |
77 | FILE *outf; |
78 | char *outFname = "hbm_out.log" ; |
79 | |
80 | trace_fd = open(TRACEFS "trace_pipe" , O_RDONLY, 0); |
81 | if (trace_fd < 0) { |
82 | printf("Error opening trace_pipe\n" ); |
83 | return; |
84 | } |
85 | |
86 | // Future support of ingress |
87 | // if (!outFlag) |
88 | // outFname = "hbm_in.log"; |
89 | outf = fopen(outFname, "w" ); |
90 | |
91 | if (outf == NULL) |
92 | printf("Error creating %s\n" , outFname); |
93 | |
94 | while (1) { |
95 | static char buf[4097]; |
96 | ssize_t sz; |
97 | |
98 | sz = read(trace_fd, buf, sizeof(buf) - 1); |
99 | if (sz > 0) { |
100 | buf[sz] = 0; |
101 | puts(buf); |
102 | if (outf != NULL) { |
103 | fprintf(outf, "%s\n" , buf); |
104 | fflush(outf); |
105 | } |
106 | } |
107 | } |
108 | } |
109 | |
/*
 * Print a fatal error message on stdout and terminate with exit status 1.
 *
 * @msg:        text printed after the "ERROR: " prefix
 * @errno_flag: when true, the current errno value is appended
 *
 * Does not return.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf("ERROR: %s\n", msg);
		exit(1);
	}

	printf("ERROR: %s, errno: %d\n", msg, errno);
	exit(1);
}
118 | |
119 | static int prog_load(char *prog) |
120 | { |
121 | struct bpf_program *pos; |
122 | const char *sec_name; |
123 | |
124 | obj = bpf_object__open_file(prog, NULL); |
125 | if (libbpf_get_error(obj)) { |
126 | printf("ERROR: opening BPF object file failed\n" ); |
127 | return 1; |
128 | } |
129 | |
130 | /* load BPF program */ |
131 | if (bpf_object__load(obj)) { |
132 | printf("ERROR: loading BPF object file failed\n" ); |
133 | goto err; |
134 | } |
135 | |
136 | bpf_object__for_each_program(pos, obj) { |
137 | sec_name = bpf_program__section_name(pos); |
138 | if (sec_name && !strcmp(sec_name, "cgroup_skb/egress" )) { |
139 | bpf_prog = pos; |
140 | break; |
141 | } |
142 | } |
143 | if (!bpf_prog) { |
144 | printf("ERROR: finding a prog in obj file failed\n" ); |
145 | goto err; |
146 | } |
147 | |
148 | queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats" ); |
149 | if (queue_stats_fd < 0) { |
150 | printf("ERROR: finding a map in obj file failed\n" ); |
151 | goto err; |
152 | } |
153 | |
154 | return 0; |
155 | |
156 | err: |
157 | bpf_object__close(obj); |
158 | return 1; |
159 | } |
160 | |
161 | static int run_bpf_prog(char *prog, int cg_id) |
162 | { |
163 | struct hbm_queue_stats qstats = {0}; |
164 | char cg_dir[100], cg_pin_path[100]; |
165 | struct bpf_link *link = NULL; |
166 | int key = 0; |
167 | int cg1 = 0; |
168 | int rc = 0; |
169 | |
170 | sprintf(buf: cg_dir, fmt: "/hbm%d" , cg_id); |
171 | rc = prog_load(prog); |
172 | if (rc != 0) |
173 | return rc; |
174 | |
175 | if (setup_cgroup_environment()) { |
176 | printf("ERROR: setting cgroup environment\n" ); |
177 | goto err; |
178 | } |
179 | cg1 = create_and_get_cgroup(cg_dir); |
180 | if (!cg1) { |
181 | printf("ERROR: create_and_get_cgroup\n" ); |
182 | goto err; |
183 | } |
184 | if (join_cgroup(cg_dir)) { |
185 | printf("ERROR: join_cgroup\n" ); |
186 | goto err; |
187 | } |
188 | |
189 | qstats.rate = rate; |
190 | qstats.stats = stats_flag ? 1 : 0; |
191 | qstats.loopback = loopback_flag ? 1 : 0; |
192 | qstats.no_cn = no_cn_flag ? 1 : 0; |
193 | if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { |
194 | printf("ERROR: Could not update map element\n" ); |
195 | goto err; |
196 | } |
197 | |
198 | if (!outFlag) |
199 | bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); |
200 | |
201 | link = bpf_program__attach_cgroup(bpf_prog, cg1); |
202 | if (libbpf_get_error(link)) { |
203 | fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n" ); |
204 | goto err; |
205 | } |
206 | |
207 | sprintf(buf: cg_pin_path, fmt: "/sys/fs/bpf/hbm%d" , cg_id); |
208 | rc = bpf_link__pin(link, cg_pin_path); |
209 | if (rc < 0) { |
210 | printf("ERROR: bpf_link__pin failed: %d\n" , rc); |
211 | goto err; |
212 | } |
213 | |
214 | if (work_conserving_flag) { |
215 | struct timeval t0, t_last, t_new; |
216 | FILE *fin; |
217 | unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; |
218 | signed long long last_cg_tx_bytes, new_cg_tx_bytes; |
219 | signed long long delta_time, delta_bytes, delta_rate; |
220 | int delta_ms; |
221 | #define DELTA_RATE_CHECK 10000 /* in us */ |
222 | #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ |
223 | |
224 | bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); |
225 | if (gettimeofday(&t0, NULL) < 0) |
226 | do_error(msg: "gettimeofday failed" , errno_flag: true); |
227 | t_last = t0; |
228 | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes" , "r" ); |
229 | if (fscanf(fin, "%llu" , &last_eth_tx_bytes) != 1) |
230 | do_error(msg: "fscanf fails" , errno_flag: false); |
231 | fclose(fin); |
232 | last_cg_tx_bytes = qstats.bytes_total; |
233 | while (true) { |
234 | usleep(DELTA_RATE_CHECK); |
235 | if (gettimeofday(&t_new, NULL) < 0) |
236 | do_error(msg: "gettimeofday failed" , errno_flag: true); |
237 | delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + |
238 | (t_new.tv_usec - t0.tv_usec)/1000; |
239 | if (delta_ms > dur * 1000) |
240 | break; |
241 | delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + |
242 | (t_new.tv_usec - t_last.tv_usec); |
243 | if (delta_time == 0) |
244 | continue; |
245 | t_last = t_new; |
246 | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes" , |
247 | "r" ); |
248 | if (fscanf(fin, "%llu" , &new_eth_tx_bytes) != 1) |
249 | do_error(msg: "fscanf fails" , errno_flag: false); |
250 | fclose(fin); |
251 | printf(" new_eth_tx_bytes:%llu\n" , |
252 | new_eth_tx_bytes); |
253 | bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); |
254 | new_cg_tx_bytes = qstats.bytes_total; |
255 | delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; |
256 | last_eth_tx_bytes = new_eth_tx_bytes; |
257 | delta_rate = (delta_bytes * 8000000) / delta_time; |
258 | printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps" , |
259 | delta_ms, delta_rate/1000000000.0, |
260 | rate/1000.0); |
261 | if (delta_rate < RATE_THRESHOLD) { |
262 | /* can increase cgroup rate limit, but first |
263 | * check if we are using the current limit. |
264 | * Currently increasing by 6.25%, unknown |
265 | * if that is the optimal rate. |
266 | */ |
267 | int rate_diff100; |
268 | |
269 | delta_bytes = new_cg_tx_bytes - |
270 | last_cg_tx_bytes; |
271 | last_cg_tx_bytes = new_cg_tx_bytes; |
272 | delta_rate = (delta_bytes * 8000000) / |
273 | delta_time; |
274 | printf(" rate:%.3fGbps" , |
275 | delta_rate/1000000000.0); |
276 | rate_diff100 = (((long long)rate)*1000000 - |
277 | delta_rate) * 100 / |
278 | (((long long) rate) * 1000000); |
279 | printf(" rdiff:%d" , rate_diff100); |
280 | if (rate_diff100 <= 3) { |
281 | rate += (rate >> 4); |
282 | if (rate > RATE_THRESHOLD / 1000000) |
283 | rate = RATE_THRESHOLD / 1000000; |
284 | qstats.rate = rate; |
285 | printf(" INC\n" ); |
286 | } else { |
287 | printf("\n" ); |
288 | } |
289 | } else { |
290 | /* Need to decrease cgroup rate limit. |
291 | * Currently decreasing by 12.5%, unknown |
292 | * if that is optimal |
293 | */ |
294 | printf(" DEC\n" ); |
295 | rate -= (rate >> 3); |
296 | if (rate < minRate) |
297 | rate = minRate; |
298 | qstats.rate = rate; |
299 | } |
300 | if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) |
301 | do_error(msg: "update map element fails" , errno_flag: false); |
302 | } |
303 | } else { |
304 | sleep(dur); |
305 | } |
306 | // Get stats! |
307 | if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { |
308 | char fname[100]; |
309 | FILE *fout; |
310 | |
311 | if (!outFlag) |
312 | sprintf(buf: fname, fmt: "hbm.%d.in" , cg_id); |
313 | else |
314 | sprintf(buf: fname, fmt: "hbm.%d.out" , cg_id); |
315 | fout = fopen(fname, "w" ); |
316 | fprintf(fout, "id:%d\n" , cg_id); |
317 | fprintf(fout, "ERROR: Could not lookup queue_stats\n" ); |
318 | fclose(fout); |
319 | } else if (stats_flag && qstats.lastPacketTime > |
320 | qstats.firstPacketTime) { |
321 | long long delta_us = (qstats.lastPacketTime - |
322 | qstats.firstPacketTime)/1000; |
323 | unsigned int rate_mbps = ((qstats.bytes_total - |
324 | qstats.bytes_dropped) * 8 / |
325 | delta_us); |
326 | double percent_pkts, percent_bytes; |
327 | char fname[100]; |
328 | FILE *fout; |
329 | int k; |
330 | static const char *returnValNames[] = { |
331 | "DROP_PKT" , |
332 | "ALLOW_PKT" , |
333 | "DROP_PKT_CWR" , |
334 | "ALLOW_PKT_CWR" |
335 | }; |
336 | #define RET_VAL_COUNT 4 |
337 | |
338 | // Future support of ingress |
339 | // if (!outFlag) |
340 | // sprintf(fname, "hbm.%d.in", cg_id); |
341 | // else |
342 | sprintf(buf: fname, fmt: "hbm.%d.out" , cg_id); |
343 | fout = fopen(fname, "w" ); |
344 | fprintf(fout, "id:%d\n" , cg_id); |
345 | fprintf(fout, "rate_mbps:%d\n" , rate_mbps); |
346 | fprintf(fout, "duration:%.1f secs\n" , |
347 | (qstats.lastPacketTime - qstats.firstPacketTime) / |
348 | 1000000000.0); |
349 | fprintf(fout, "packets:%d\n" , (int)qstats.pkts_total); |
350 | fprintf(fout, "bytes_MB:%d\n" , (int)(qstats.bytes_total / |
351 | 1000000)); |
352 | fprintf(fout, "pkts_dropped:%d\n" , (int)qstats.pkts_dropped); |
353 | fprintf(fout, "bytes_dropped_MB:%d\n" , |
354 | (int)(qstats.bytes_dropped / |
355 | 1000000)); |
356 | // Marked Pkts and Bytes |
357 | percent_pkts = (qstats.pkts_marked * 100.0) / |
358 | (qstats.pkts_total + 1); |
359 | percent_bytes = (qstats.bytes_marked * 100.0) / |
360 | (qstats.bytes_total + 1); |
361 | fprintf(fout, "pkts_marked_percent:%6.2f\n" , percent_pkts); |
362 | fprintf(fout, "bytes_marked_percent:%6.2f\n" , percent_bytes); |
363 | |
364 | // Dropped Pkts and Bytes |
365 | percent_pkts = (qstats.pkts_dropped * 100.0) / |
366 | (qstats.pkts_total + 1); |
367 | percent_bytes = (qstats.bytes_dropped * 100.0) / |
368 | (qstats.bytes_total + 1); |
369 | fprintf(fout, "pkts_dropped_percent:%6.2f\n" , percent_pkts); |
370 | fprintf(fout, "bytes_dropped_percent:%6.2f\n" , percent_bytes); |
371 | |
372 | // ECN CE markings |
373 | percent_pkts = (qstats.pkts_ecn_ce * 100.0) / |
374 | (qstats.pkts_total + 1); |
375 | fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n" , percent_pkts, |
376 | (int)qstats.pkts_ecn_ce); |
377 | |
378 | // Average cwnd |
379 | fprintf(fout, "avg cwnd:%d\n" , |
380 | (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); |
381 | // Average rtt |
382 | fprintf(fout, "avg rtt:%d\n" , |
383 | (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); |
384 | // Average credit |
385 | if (edt_flag) |
386 | fprintf(fout, "avg credit_ms:%.03f\n" , |
387 | (qstats.sum_credit / |
388 | (qstats.pkts_total + 1.0)) / 1000000.0); |
389 | else |
390 | fprintf(fout, "avg credit:%d\n" , |
391 | (int)(qstats.sum_credit / |
392 | (1500 * ((int)qstats.pkts_total ) + 1))); |
393 | |
394 | // Return values stats |
395 | for (k = 0; k < RET_VAL_COUNT; k++) { |
396 | percent_pkts = (qstats.returnValCount[k] * 100.0) / |
397 | (qstats.pkts_total + 1); |
398 | fprintf(fout, "%s:%6.2f (%d)\n" , returnValNames[k], |
399 | percent_pkts, (int)qstats.returnValCount[k]); |
400 | } |
401 | fclose(fout); |
402 | } |
403 | |
404 | if (debugFlag) |
405 | read_trace_pipe2(); |
406 | goto cleanup; |
407 | |
408 | err: |
409 | rc = 1; |
410 | |
411 | cleanup: |
412 | bpf_link__destroy(link); |
413 | bpf_object__close(obj); |
414 | |
415 | if (cg1 != -1) |
416 | close(cg1); |
417 | |
418 | if (rc != 0) |
419 | cleanup_cgroup_environment(); |
420 | return rc; |
421 | } |
422 | |
/*
 * Print the command-line help text on stdout.
 *
 * The text documents the -t default as 1, matching the initial value of
 * `dur`, and lists --edt in the synopsis as well as in the option table.
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	       " [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " --edt use fq's Earliest Departure Time\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " --no_cn disable CN notifications\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 1)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
449 | |
450 | int main(int argc, char **argv) |
451 | { |
452 | char *prog = "hbm_out_kern.o" ; |
453 | int k; |
454 | int cg_id = 1; |
455 | char *optstring = "iodln:r:st:wh" ; |
456 | struct option loptions[] = { |
457 | {"no_cn" , 0, NULL, 1}, |
458 | {"edt" , 0, NULL, 2}, |
459 | {NULL, 0, NULL, 0} |
460 | }; |
461 | |
462 | while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { |
463 | switch (k) { |
464 | case 1: |
465 | no_cn_flag = true; |
466 | break; |
467 | case 2: |
468 | prog = "hbm_edt_kern.o" ; |
469 | edt_flag = true; |
470 | break; |
471 | case'o': |
472 | break; |
473 | case 'd': |
474 | debugFlag = true; |
475 | break; |
476 | case 'l': |
477 | loopback_flag = true; |
478 | break; |
479 | case 'n': |
480 | cg_id = atoi(optarg); |
481 | break; |
482 | case 'r': |
483 | minRate = atoi(optarg) * 1.024; |
484 | rate = minRate; |
485 | break; |
486 | case 's': |
487 | stats_flag = true; |
488 | break; |
489 | case 't': |
490 | dur = atoi(optarg); |
491 | break; |
492 | case 'w': |
493 | work_conserving_flag = true; |
494 | break; |
495 | case '?': |
496 | if (optopt == 'n' || optopt == 'r' || optopt == 't') |
497 | fprintf(stderr, |
498 | "Option -%c requires an argument.\n\n" , |
499 | optopt); |
500 | case 'h': |
501 | default: |
502 | Usage(); |
503 | return 0; |
504 | } |
505 | } |
506 | |
507 | if (optind < argc) |
508 | prog = argv[optind]; |
509 | printf("HBM prog: %s\n" , prog != NULL ? prog : "NULL" ); |
510 | |
511 | /* Use libbpf 1.0 API mode */ |
512 | libbpf_set_strict_mode(LIBBPF_STRICT_ALL); |
513 | |
514 | return run_bpf_prog(prog, cg_id); |
515 | } |
516 | |