// SPDX-License-Identifier: GPL-2.0-only

/* WARNING: This implementation is not necessarily the same
 * as tcp_cubic.c. The purpose is mainly for testing
 * the kernel BPF logic.
 *
 * Highlights:
 * 1. CONFIG_HZ .kconfig map is used.
 * 2. In bictcp_update(), calculation is changed to use usec
 *    resolution (i.e. USEC_PER_JIFFY) instead of using jiffies.
 *    Thus, usecs_to_jiffies() is not used in bpf_cubic.c.
 * 3. In bictcp_update() [under tcp_friendliness], the original
 *    "while (ca->ack_cnt > delta)" loop is changed to the equivalent
 *    "ca->ack_cnt / delta" operation (see the worked example below).
 */
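
/* Worked example of the loop-to-division rewrite in highlight 3 above
 * (numbers are illustrative, not from the original source): with
 * ack_cnt = 25 and delta = 7, the original loop runs three times and
 * leaves ack_cnt = 4; n = 25 / 7 = 3, ack_cnt -= n * delta and
 * tcp_cwnd += n reach the same state in constant time, which also
 * sidesteps a loop bound the BPF verifier may not accept.
 */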

#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/tcp.h>
#include "bpf_tcp_helpers.h"

char _license[] SEC("license") = "GPL";

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

#define BICTCP_BETA_SCALE    1024	/* Scale factor for beta calculation
					 * max_cwnd = snd_cwnd * beta
					 */
#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN	0x1
#define HYSTART_DELAY		0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8
#define HYSTART_DELAY_MIN	(4000U)		/* 4 ms */
#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
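
/* Illustrative values (not from the original source): the threshold is
 * later applied to ca->delay_min >> 3, i.e. 12.5% of the minimum RTT.
 * For a 2 ms min RTT that is 250 usec, clamped up to HYSTART_DELAY_MIN
 * (4000 usec); for a 200 ms min RTT it is 25000 usec, clamped down to
 * HYSTART_DELAY_MAX (16000 usec).
 */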

static int fast_convergence = 1;
static const int beta = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh;
static const int bic_scale = 41;
static int tcp_friendliness = 1;

static int hystart = 1;
static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window = 16;
static int hystart_ack_delta_us = 2000;

static const __u32 cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */
static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
				/ (BICTCP_BETA_SCALE - beta);
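
/* With beta = 717 this folds to
 * beta_scale = 8 * (1024 + 717) / 3 / (1024 - 717) = 13928 / 3 / 307 = 15,
 * so the "(cwnd * scale) >> 3" in the tcp_friendliness path below
 * approximates cwnd * 15 / 8.
 */
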
/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
 * so K = cubic_root( (wmax-cwnd)*rtt/c )
 * the unit of K is bictcp_HZ=2^10, not HZ
 *
 * c = bic_scale >> 10
 * rtt = 100ms
 *
 * the following code has been designed and tested for
 * cwnd < 1 million packets
 * RTT < 100 seconds
 * HZ < 100,000,000 (corresponding to 10 nano-second)
 */

/* 1/c * srtt * 2^(3*bictcp_HZ), i.e. 2^40 / (bic_scale * 10) */
static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
				/ (bic_scale * 10);
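
/* Numerically (illustrative, with bic_scale = 41):
 * cube_factor = 2^40 / 410 ~= 2.68e9.  bictcp_update() below computes
 * bic_K = cubic_root(cube_factor * (wmax - cwnd)); e.g. dropping 100
 * packets below wmax gives cubic_root(~2.68e11) ~= 6450 bictcp_HZ
 * units, i.e. K ~= 6.3 seconds to return to the origin point.
 */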

/* BIC TCP Parameters */
struct bictcp {
	__u32	cnt;		/* increase cwnd by 1 after ACKs */
	__u32	last_max_cwnd;	/* last maximum snd_cwnd */
	__u32	last_cwnd;	/* the last snd_cwnd */
	__u32	last_time;	/* time when updated last_cwnd */
	__u32	bic_origin_point;/* origin point of bic function */
	__u32	bic_K;		/* time to origin point
				   from the beginning of the current epoch */
	__u32	delay_min;	/* min delay (usec) */
	__u32	epoch_start;	/* beginning of an epoch */
	__u32	ack_cnt;	/* number of acks */
	__u32	tcp_cwnd;	/* estimated tcp cwnd */
	__u16	unused;
	__u8	sample_cnt;	/* number of samples to decide curr_rtt */
	__u8	found;		/* the exit point is found? */
	__u32	round_start;	/* beginning of each round */
	__u32	end_seq;	/* end_seq of the round */
	__u32	last_ack;	/* last time when the ACK spacing is close */
	__u32	curr_rtt;	/* the minimum rtt of current round */
};

static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->bic_origin_point = 0;
	ca->bic_K = 0;
	ca->delay_min = 0;
	ca->epoch_start = 0;
	ca->ack_cnt = 0;
	ca->tcp_cwnd = 0;
	ca->found = 0;
}

extern unsigned long CONFIG_HZ __kconfig;
#define HZ CONFIG_HZ
#define USEC_PER_MSEC	1000UL
#define USEC_PER_SEC	1000000UL
#define USEC_PER_JIFFY	(USEC_PER_SEC / HZ)
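
/* Illustrative values: USEC_PER_JIFFY is 1000 for CONFIG_HZ=1000, 4000
 * for CONFIG_HZ=250 and 10000 for CONFIG_HZ=100, so the usec-based math
 * below stays HZ-independent without calling usecs_to_jiffies().
 */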

static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

#define div64_ul div64_u64

#define BITS_PER_U64 (sizeof(__u64) * 8)
static __always_inline int fls64(__u64 x)
{
	int num = BITS_PER_U64 - 1;

	if (x == 0)
		return 0;

	if (!(x & (~0ull << (BITS_PER_U64-32)))) {
		num -= 32;
		x <<= 32;
	}
	if (!(x & (~0ull << (BITS_PER_U64-16)))) {
		num -= 16;
		x <<= 16;
	}
	if (!(x & (~0ull << (BITS_PER_U64-8)))) {
		num -= 8;
		x <<= 8;
	}
	if (!(x & (~0ull << (BITS_PER_U64-4)))) {
		num -= 4;
		x <<= 4;
	}
	if (!(x & (~0ull << (BITS_PER_U64-2)))) {
		num -= 2;
		x <<= 2;
	}
	if (!(x & (~0ull << (BITS_PER_U64-1))))
		num -= 1;

	return num + 1;
}
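
/* Sanity examples for the branchless fls64() above (intended to match
 * the kernel helper of the same name): fls64(0) == 0, fls64(1) == 1,
 * fls64(0x8000) == 16 and fls64(~0ull) == 64, i.e. the 1-based index
 * of the most significant set bit.
 */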

static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
{
	return tcp_sk(sk)->tcp_mstamp;
}

static __always_inline void bictcp_hystart_reset(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
	ca->end_seq = tp->snd_nxt;
	ca->curr_rtt = ~0U;
	ca->sample_cnt = 0;
}

/* "struct_ops/" prefix is a requirement */
SEC("struct_ops/bpf_cubic_init")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);

	if (hystart)
		bictcp_hystart_reset(sk);

	if (!hystart && initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

/* "struct_ops" prefix is a requirement */
SEC("struct_ops/bpf_cubic_cwnd_event")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	if (event == CA_EVENT_TX_START) {
		struct bictcp *ca = inet_csk_ca(sk);
		__u32 now = tcp_jiffies32;
		__s32 delta;

		delta = now - tcp_sk(sk)->lsndtime;

		/* We were application limited (idle) for a while.
		 * Shift epoch_start to keep cwnd growth to cubic curve.
		 */
		if (ca->epoch_start && delta > 0) {
			ca->epoch_start += delta;
			if (after(ca->epoch_start, now))
				ca->epoch_start = now;
		}
		return;
	}
}

/*
 * cbrt(x) MSB values for x MSB values in [0..63].
 * Precomputed then refined by hand - Willy Tarreau
 *
 * For x in [0..63],
 *   v = cbrt(x << 18) - 1
 *   cbrt(x) = (v[x] + 10) >> 6
 */
static const __u8 v[] = {
	/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
	/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
	/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
	/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
	/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
	/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
	/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
	/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
};

/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
static __always_inline __u32 cubic_root(__u64 a)
{
	__u32 x, b, shift;

	if (a < 64) {
		/* a in [0..63] */
		return ((__u32)v[(__u32)a] + 35) >> 6;
	}

	b = fls64(a);
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* it is needed for the verifier's bounds check on v */
	if (shift >= 64)
		return 0;

	x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
	x = ((x * 341) >> 10);
	return x;
}
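
/* Worked example (approximate by design, avg err ~0.195%):
 * cubic_root(1000) -> fls64(1000) = 10, b = ((10 * 84) >> 8) - 1 = 2,
 * shift = 1000 >> 6 = 15, first guess x = ((156 + 10) << 2) >> 6 = 10,
 * and the Newton-Raphson step keeps x = 10, the exact cube root.
 */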

/*
 * Compute congestion window to use.
 */
static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
					  __u32 acked)
{
	__u32 delta, bic_target, max_cnt;
	__u64 offs, t;

	ca->ack_cnt += acked;	/* count the number of ACKed packets */

	if (ca->last_cwnd == cwnd &&
	    (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
		return;

	/* The CUBIC function can update ca->cnt at most once per jiffy.
	 * On all cwnd reduction events, ca->epoch_start is set to 0,
	 * which will force a recalculation of ca->cnt.
	 */
	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
		goto tcp_friendliness;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_jiffies32;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_jiffies32;	/* record beginning */
		ca->ack_cnt = acked;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 is done by using 64 bit)
	 * and without the support of division of 64bit numbers
	 * (so all divisions are done by using 32 bit)
	 * also NOTE the unit of those variables
	 *	  time = (t - K) / 2^bictcp_HZ
	 *	  c = bic_scale >> 10
	 *	  rtt = (srtt >> 3) / HZ
	 * !!! The following code does not have overflow problems,
	 * if the cwnd < 1 million packets !!!
	 */

	t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
	t += ca->delay_min;
	/* change the unit from usec to bictcp_HZ */
	t <<= BICTCP_HZ;
	t /= USEC_PER_SEC;
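
	/* Example of the conversion above (illustrative, assuming
	 * CONFIG_HZ=1000 and delay_min = 10000 usec): 50 jiffies after
	 * epoch_start, t = ((50 * 1000 + 10000) << 10) / 1000000 = 61,
	 * i.e. 61/1024 of a second in bictcp_HZ units.
	 */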

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)		/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else				/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;	/* very small increment */
	}

	/*
	 * The initial growth of cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

tcp_friendliness:
	/* TCP Friendly */
	if (tcp_friendliness) {
		__u32 scale = beta_scale;
		__u32 n;

		/* update tcp cwnd */
		delta = (cwnd * scale) >> 3;
		if (ca->ack_cnt > delta && delta) {
			n = ca->ack_cnt / delta;
			ca->ack_cnt -= n * delta;
			ca->tcp_cwnd += n;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
	 */
	ca->cnt = max(ca->cnt, 2U);
}

/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boilerplate. */
void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tcp_in_slow_start(tp)) {
		if (hystart && after(ack, ca->end_seq))
			bictcp_hystart_reset(sk);
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	bictcp_update(ca, tp->snd_cwnd, acked);
	tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
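
/* Worked example (illustrative numbers): on a loss with snd_cwnd = 100
 * below the previous last_max_cwnd, fast convergence stores
 * last_max_cwnd = 100 * (1024 + 717) / 2048 = 85 and the function
 * returns max(100 * 717 / 1024, 2U) = 70, i.e. cwnd backs off to
 * roughly 70% (beta/1024).
 */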

void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	if (new_state == TCP_CA_Loss) {
		bictcp_reset(inet_csk_ca(sk));
		bictcp_hystart_reset(sk);
	}
}

#define GSO_MAX_SIZE 65536

/* Account for TSO/GRO delays.
 * Otherwise short RTT flows could get too small ssthresh, since during
 * slow start we begin with small TSO packets and ca->delay_min would
 * not account for long aggregation delay when TSO packets get bigger.
 * Ideally even with a very small RTT we would like to have at least one
 * TSO packet being sent and received by GRO, and another one in qdisc layer.
 * We apply another 100% factor because @rate is doubled at this point.
 * We cap the cushion to 1ms.
 */
static __always_inline __u32 hystart_ack_delay(struct sock *sk)
{
	unsigned long rate;

	rate = sk->sk_pacing_rate;
	if (!rate)
		return 0;
	return min((__u64)USEC_PER_MSEC,
		   div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
}
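
/* Illustrative numbers: at sk_pacing_rate = 125000000 bytes/sec
 * (~1 Gbit/sec), 65536 * 4 * 1000000 / 125000000 ~= 2097 usec, so the
 * cushion is capped at USEC_PER_MSEC; around 10 Gbit/sec the uncapped
 * value is ~210 usec.
 */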

static __always_inline void hystart_update(struct sock *sk, __u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 threshold;

	if (hystart_detect & HYSTART_ACK_TRAIN) {
		__u32 now = bictcp_clock_us(sk);

		/* first detection parameter - ack-train detection */
		if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
			ca->last_ack = now;

			threshold = ca->delay_min + hystart_ack_delay(sk);

			/* Hystart ack train triggers if we get ack past
			 * ca->delay_min/2.
			 * Pacing might have delayed packets up to RTT/2
			 * during slow start.
			 */
			if (sk->sk_pacing_status == SK_PACING_NONE)
				threshold >>= 1;

			if ((__s32)(now - ca->round_start) > threshold) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}

	if (hystart_detect & HYSTART_DELAY) {
		/* track the minimum delay seen over at least
		 * HYSTART_MIN_SAMPLES packets
		 */
		if (ca->curr_rtt > delay)
			ca->curr_rtt = delay;
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			ca->sample_cnt++;
		} else {
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
				ca->found = 1;
				tp->snd_ssthresh = tp->snd_cwnd;
			}
		}
	}
}
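
/* Illustrative trigger point for the HYSTART_DELAY check above: with
 * delay_min = 8000 usec, HYSTART_DELAY_THRESH(8000 >> 3) clamps
 * 1000 usec up to 4000 usec, so slow start exits once the round's
 * minimum RTT exceeds 12000 usec, a 50% increase over the base RTT.
 */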

int bpf_cubic_acked_called = 0;

void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		    const struct ack_sample *sample)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	__u32 delay;

	bpf_cubic_acked_called = 1;
	/* Some calls are for duplicates without timestamps */
	if (sample->rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
		return;

	delay = sample->rtt_us;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}

extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;

__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

SEC(".struct_ops")
struct tcp_congestion_ops cubic = {
	.init		= (void *)bpf_cubic_init,
	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
	.cong_avoid	= (void *)bpf_cubic_cong_avoid,
	.set_state	= (void *)bpf_cubic_state,
	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
	.pkts_acked	= (void *)bpf_cubic_acked,
	.name		= "bpf_cubic",
};