// SPDX-License-Identifier: GPL-2.0-only

/* WARNING: This implementation is not necessarily the same
 * as tcp_cubic.c. The purpose is mainly for testing
 * the kernel BPF logic.
 *
 * Highlights:
 * 1. CONFIG_HZ .kconfig map is used.
 * 2. In bictcp_update(), calculation is changed to use usec
 *    resolution (i.e. USEC_PER_JIFFY) instead of using jiffies.
 *    Thus, usecs_to_jiffies() is not used in bpf_cubic.c.
 * 3. In bictcp_update() [under tcp_friendliness], the original
 *    "while (ca->ack_cnt > delta)" loop is changed to the equivalent
 *    "ca->ack_cnt / delta" operation (see the worked example below).
 */
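
/* Worked example of the loop-to-division rewrite in highlight 3 above
 * (numbers are illustrative, not from the original source): with
 * ack_cnt = 25 and delta = 7, the original loop runs three times and
 * leaves ack_cnt = 4; n = 25 / 7 = 3, ack_cnt -= n * delta and
 * tcp_cwnd += n reach the same state in constant time, which also
 * sidesteps a loop bound the BPF verifier may not accept.
 */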

#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include "bpf_tcp_helpers.h"

char _license[] SEC("license") = "GPL";

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

#define BICTCP_BETA_SCALE    1024	/* Scale factor for beta calculation
					 * max_cwnd = snd_cwnd * beta
					 */
#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN	0x1
#define HYSTART_DELAY		0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8
#define HYSTART_DELAY_MIN	(4000U)		/* 4 ms */
#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
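
/* Illustrative values (not from the original source): the threshold is
 * later applied to ca->delay_min >> 3, i.e. 12.5% of the minimum RTT.
 * For a 2 ms min RTT that is 250 usec, clamped up to HYSTART_DELAY_MIN
 * (4000 usec); for a 200 ms min RTT it is 25000 usec, clamped down to
 * HYSTART_DELAY_MAX (16000 usec).
 */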

static int fast_convergence = 1;
static const int beta = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh;
static const int bic_scale = 41;
static int tcp_friendliness = 1;

static int hystart = 1;
static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window = 16;
static int hystart_ack_delta_us = 2000;

static const __u32 cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */
static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
				/ (BICTCP_BETA_SCALE - beta);
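
/* With beta = 717 this folds to
 * beta_scale = 8 * (1024 + 717) / 3 / (1024 - 717) = 13928 / 3 / 307 = 15,
 * so the "(cwnd * scale) >> 3" in the tcp_friendliness path below
 * approximates cwnd * 15 / 8.
 */
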
/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
 * so K = cubic_root( (wmax-cwnd)*rtt/c )
 * the unit of K is bictcp_HZ=2^10, not HZ
 *
 * c = bic_scale >> 10
 * rtt = 100ms
 *
 * the following code has been designed and tested for
 * cwnd < 1 million packets
 * RTT < 100 seconds
 * HZ < 100,000,000 (corresponding to 10 nano-second)
 */

/* 1/c * srtt * 2^(3*bictcp_HZ), i.e. 2^40 / (bic_scale * 10) */
static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
				/ (bic_scale * 10);
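
/* Numerically (illustrative, with bic_scale = 41):
 * cube_factor = 2^40 / 410 ~= 2.68e9.  bictcp_update() below computes
 * bic_K = cubic_root(cube_factor * (wmax - cwnd)); e.g. dropping 100
 * packets below wmax gives cubic_root(~2.68e11) ~= 6450 bictcp_HZ
 * units, i.e. K ~= 6.3 seconds to return to the origin point.
 */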

/* BIC TCP Parameters */
struct bictcp {
	__u32	cnt;		/* increase cwnd by 1 after ACKs */
	__u32	last_max_cwnd;	/* last maximum snd_cwnd */
	__u32	last_cwnd;	/* the last snd_cwnd */
	__u32	last_time;	/* time when updated last_cwnd */
	__u32	bic_origin_point;/* origin point of bic function */
	__u32	bic_K;		/* time to origin point
				   from the beginning of the current epoch */
	__u32	delay_min;	/* min delay (usec) */
	__u32	epoch_start;	/* beginning of an epoch */
	__u32	ack_cnt;	/* number of acks */
	__u32	tcp_cwnd;	/* estimated tcp cwnd */
	__u16	unused;
	__u8	sample_cnt;	/* number of samples to decide curr_rtt */
	__u8	found;		/* the exit point is found? */
	__u32	round_start;	/* beginning of each round */
	__u32	end_seq;	/* end_seq of the round */
	__u32	last_ack;	/* last time when the ACK spacing is close */
	__u32	curr_rtt;	/* the minimum rtt of current round */
};

static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->bic_origin_point = 0;
	ca->bic_K = 0;
	ca->delay_min = 0;
	ca->epoch_start = 0;
	ca->ack_cnt = 0;
	ca->tcp_cwnd = 0;
	ca->found = 0;
}

extern unsigned long CONFIG_HZ __kconfig;
#define HZ CONFIG_HZ
#define USEC_PER_MSEC	1000UL
#define USEC_PER_SEC	1000000UL
#define USEC_PER_JIFFY	(USEC_PER_SEC / HZ)
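
/* Illustrative values: USEC_PER_JIFFY is 1000 for CONFIG_HZ=1000, 4000
 * for CONFIG_HZ=250 and 10000 for CONFIG_HZ=100, so the usec-based math
 * below stays HZ-independent without calling usecs_to_jiffies().
 */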

static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

#define div64_ul div64_u64

#define BITS_PER_U64 (sizeof(__u64) * 8)
static __always_inline int fls64(__u64 x)
{
	int num = BITS_PER_U64 - 1;

	if (x == 0)
		return 0;

	if (!(x & (~0ull << (BITS_PER_U64-32)))) {
		num -= 32;
		x <<= 32;
	}
	if (!(x & (~0ull << (BITS_PER_U64-16)))) {
		num -= 16;
		x <<= 16;
	}
	if (!(x & (~0ull << (BITS_PER_U64-8)))) {
		num -= 8;
		x <<= 8;
	}
	if (!(x & (~0ull << (BITS_PER_U64-4)))) {
		num -= 4;
		x <<= 4;
	}
	if (!(x & (~0ull << (BITS_PER_U64-2)))) {
		num -= 2;
		x <<= 2;
	}
	if (!(x & (~0ull << (BITS_PER_U64-1))))
		num -= 1;

	return num + 1;
}
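
/* Sanity examples for the branchless fls64() above (intended to match
 * the kernel helper of the same name): fls64(0) == 0, fls64(1) == 1,
 * fls64(0x8000) == 16 and fls64(~0ull) == 64, i.e. the 1-based index
 * of the most significant set bit.
 */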

static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;
}

static __always_inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;
	ca->curr_rtt = ~0U;
	ca->sample_cnt = 0;
}

/* "struct_ops/" prefix is a requirement */
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

/* "struct_ops" prefix is a requirement */
SEC("struct_ops/bpf_cubic_cwnd_event")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		__u32 now = tcp_jiffies32;
		__s32 delta;

		delta = now - tcp_sk(sk)->lsndtime;

		/* We were application limited (idle) for a while.
		 * Shift epoch_start to keep cwnd growth to cubic curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}

/*
 * cbrt(x) MSB values for x MSB values in [0..63].
 * Precomputed then refined by hand - Willy Tarreau
 *
 * For x in [0..63],
 *   v = cbrt(x << 18) - 1
 *   cbrt(x) = (v[x] + 10) >> 6
 */
static const __u8 v[] = {
	/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
	/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
	/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
	/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
	/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
	/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
	/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
	/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
};

/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
static __always_inline __u32 cubic_root(__u64 a)
{
	__u32 x, b, shift;

	if (a < 64) {
		/* a in [0..63] */
		return ((__u32)v[(__u32)a] + 35) >> 6;
	}

	b = fls64(a);
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* it is needed for the verifier's bounds check on v */
	if (shift >= 64)
		return 0;

	x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}
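
/* Worked example (approximate by design, avg err ~0.195%):
 * cubic_root(1000) -> fls64(1000) = 10, b = ((10 * 84) >> 8) - 1 = 2,
 * shift = 1000 >> 6 = 15, first guess x = ((156 + 10) << 2) >> 6 = 10,
 * and the Newton-Raphson step keeps x = 10, the exact cube root.
 */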

/*
 * Compute congestion window to use.
 */
static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
					  __u32 acked)
{
	__u32 delta, bic_target, max_cnt;
	__u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	if (ca->last_cwnd == cwnd &&
	    (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 is done by using 64 bit)
	 * and without the support of division of 64bit numbers
	 * (so all divisions are done by using 32 bit)
	 * also NOTE the unit of those variables
	 *	  time = (t - K) / 2^bictcp_HZ
	 *	  c = bic_scale >> 10
	 *	  rtt = (srtt >> 3) / HZ
	 * !!! The following code does not have overflow problems,
	 * if the cwnd < 1 million packets !!!
	 */

	t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
	t += ca->delay_min;
	/* change the unit from usec to bictcp_HZ */
	t <<= BICTCP_HZ;
	t /= USEC_PER_SEC;
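
	/* Example of the conversion above (illustrative, assuming
	 * CONFIG_HZ=1000 and delay_min = 10000 usec): 50 jiffies after
	 * epoch_start, t = ((50 * 1000 + 10000) << 10) / 1000000 = 61,
	 * i.e. 61/1024 of a second in bictcp_HZ units.
	 */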

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)		/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else				/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;	/* very small increment */
	}

	/*
	 * The initial growth of cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP Friendly */
	if (tcp_friendliness) {
		__u32 scale = beta_scale;
		__u32 n;

		/* update tcp cwnd */
		delta = (cwnd * scale) >> 3;
		if (ca->ack_cnt > delta && delta) {
			n = ca->ack_cnt / delta;
			ca->ack_cnt -= n * delta;
			ca->tcp_cwnd += n;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}

/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boilerplate. */
void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	bictcp_update(ca, tp->snd_cwnd, acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
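
/* Worked example (illustrative numbers): on a loss with snd_cwnd = 100
 * below the previous last_max_cwnd, fast convergence stores
 * last_max_cwnd = 100 * (1024 + 717) / 2048 = 85 and the function
 * returns max(100 * 717 / 1024, 2U) = 70, i.e. cwnd backs off to
 * roughly 70% (beta/1024).
 */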

void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

#define GSO_MAX_SIZE 65536

/* Account for TSO/GRO delays.
 * Otherwise short RTT flows could get too small ssthresh, since during
 * slow start we begin with small TSO packets and ca->delay_min would
 * not account for long aggregation delay when TSO packets get bigger.
 * Ideally even with a very small RTT we would like to have at least one
 * TSO packet being sent and received by GRO, and another one in qdisc layer.
 * We apply another 100% factor because @rate is doubled at this point.
 * We cap the cushion to 1ms.
 */
static __always_inline __u32 hystart_ack_delay(struct sock *sk)
{
	unsigned long rate;

	rate = sk->sk_pacing_rate;
	if (!rate)
		return 0;
	return min((__u64)USEC_PER_MSEC,
		   div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}
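
/* Illustrative numbers: at sk_pacing_rate = 125000000 bytes/sec
 * (~1 Gbit/sec), 65536 * 4 * 1000000 / 125000000 ~= 2097 usec, so the
 * cushion is capped at USEC_PER_MSEC; around 10 Gbit/sec the uncapped
 * value is ~210 usec.
 */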

static __always_inline void hystart_update(struct sock *sk, __u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		__u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to RTT/2
			 * during slow start.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((__s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* track the minimum delay seen over at least
		 * HYSTART_MIN_SAMPLES packets
		 */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}
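
/* Illustrative trigger point for the HYSTART_DELAY check above: with
 * delay_min = 8000 usec, HYSTART_DELAY_THRESH(8000 >> 3) clamps
 * 1000 usec up to 4000 usec, so slow start exits once the round's
 * minimum RTT exceeds 12000 usec, a 50% increase over the base RTT.
 */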

int bpf_cubic_acked_called = 0;

void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		    const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 delay;

	bpf_cubic_acked_called = 1;
	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}

extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
	.init		= (void *)bpf_cubic_init,
	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
	.cong_avoid	= (void *)bpf_cubic_cong_avoid,
	.set_state	= (void *)bpf_cubic_state,
	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
	.pkts_acked	= (void *)bpf_cubic_acked,
	.name		= "bpf_cubic",
};