tcp_nv.c source code [linux/net/ipv4/tcp_nv.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* TCP NV: TCP with Congestion Avoidance
4	*
5	* TCP-NV is a successor of TCP-Vegas that has been developed to
6	* deal with the issues that occur in modern networks.
7	* Like TCP-Vegas, TCP-NV supports true congestion avoidance,
8	* the ability to detect congestion before packet losses occur.
9	* When congestion (queue buildup) starts to occur, TCP-NV
10	* predicts what the cwnd size should be for the current
11	* throughput and it reduces the cwnd proportionally to
12	* the difference between the current cwnd and the predicted cwnd.
13	*
14	* NV is only recommended for traffic within a data center, and when
15	* all the flows are NV (at least those within the data center). This
16	* is due to the inherent unfairness between flows using losses to
17	* detect congestion (congestion control) and those that use queue
18	* buildup to detect congestion (congestion avoidance).
19	*
20	* Note: High NIC coalescence values may lower the performance of NV
21	* due to the increased noise in RTT values. In particular, we have
22	* seen issues with rx-frames values greater than 8.
23	*
24	* TODO:
25	* 1) Add mechanism to deal with reverse congestion.
26	*/
27
28	#include <linux/module.h>
29	#include <linux/math64.h>
30	#include <net/tcp.h>
31	#include <linux/inet_diag.h>
32
33	/ TCP NV parameters*
34	*
35	* nv_pad Max number of queued packets allowed in network
36	* nv_pad_buffer Do not grow cwnd if this closed to nv_pad
37	* nv_reset_period How often (in) seconds)to reset min_rtt
38	* nv_min_cwnd Don't decrease cwnd below this if there are no losses
39	* nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected
40	* nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8
41	* nv_rtt_factor RTT averaging factor
42	* nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur
43	* nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd
44	* nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd
45	* nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping
46	* slow-start due to congestion
47	* nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion
48	* nv_rtt_min_cnt Wait these many RTTs before making congesion decision
49	* nv_cwnd_growth_rate_neg
50	* nv_cwnd_growth_rate_pos
51	* How quickly to double growth rate (not rate) of cwnd when not
52	* congested. One value (nv_cwnd_growth_rate_neg) for when
53	* rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos)
54	* otherwise.
55	*/
56
57	static int nv_pad __read_mostly = `10`;
58	static int nv_pad_buffer __read_mostly = `2`;
59	static int nv_reset_period __read_mostly = `5`; / in seconds /
60	static int nv_min_cwnd __read_mostly = `2`;
61	static int nv_cong_dec_mult __read_mostly = `30` * `128` / `100`; / = 30% /
62	static int nv_ssthresh_factor __read_mostly = `8`; / = 1 /
63	static int nv_rtt_factor __read_mostly = `128`; / = 1/2old + 1/2new /
64	static int nv_loss_dec_factor __read_mostly = `819`; / => 80% /
65	static int nv_cwnd_growth_rate_neg __read_mostly = `8`;
66	static int nv_cwnd_growth_rate_pos __read_mostly; / 0 => fixed like Reno /
67	static int nv_dec_eval_min_calls __read_mostly = `60`;
68	static int nv_inc_eval_min_calls __read_mostly = `20`;
69	static int nv_ssthresh_eval_min_calls __read_mostly = `30`;
70	static int nv_stop_rtt_cnt __read_mostly = `10`;
71	static int nv_rtt_min_cnt __read_mostly = `2`;
72
73	module_param(nv_pad, int, `0644`);
74	MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network");
75	module_param(nv_reset_period, int, `0644`);
76	MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
77	module_param(nv_min_cwnd, int, `0644`);
78	MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
79	" without losses");
80
81	/ TCP NV Parameters /
82	struct tcpnv {
83	unsigned long nv_min_rtt_reset_jiffies; / when to switch to*
84	* nv_min_rtt_new */
85	s8 cwnd_growth_factor; / Current cwnd growth factor,*
86	* < 0 => less than 1 packet/RTT */
87	u8 available8;
88	u16 available16;
89	u8 nv_allow_cwnd_growth:`1`, / whether cwnd can grow /
90	nv_reset:`1`, / whether to reset values /
91	nv_catchup:`1`; / whether we are growing because*
92	* of temporary cwnd decrease */
93	u8 nv_eval_call_cnt; / call count since last eval /
94	u8 nv_min_cwnd; / nv won't make a ca decision if cwnd is*
95	* smaller than this. It may grow to handle
96	* TSO, LRO and interrupt coalescence because
97	* with these a small cwnd cannot saturate
98	* the link. Note that this is different from
99	* the file local nv_min_cwnd */
100	u8 nv_rtt_cnt; / RTTs without making ca decision /;
101	u32 nv_last_rtt; / last rtt /
102	u32 nv_min_rtt; / active min rtt. Used to determine slope /
103	u32 nv_min_rtt_new; / min rtt for future use /
104	u32 nv_base_rtt; / If non-zero it represents the threshold for*
105	* congestion */
106	u32 nv_lower_bound_rtt; / Used in conjunction with nv_base_rtt. It is*
107	* set to 80% of nv_base_rtt. It helps reduce
108	* unfairness between flows */
109	u32 nv_rtt_max_rate; / max rate seen during current RTT /
110	u32 nv_rtt_start_seq; / current RTT ends when packet arrives*
111	* acking beyond nv_rtt_start_seq */
112	u32 nv_last_snd_una; / Previous value of tp->snd_una. It is*
113	* used to determine bytes acked since last
114	* call to bictcp_acked */
115	u32 nv_no_cong_cnt; / Consecutive no congestion decisions /
116	};
117
#define NV_INIT_RTT	  U32_MAX	/* "no RTT sample yet" marker for min rtt fields */
#define NV_MIN_CWND	  4		/* initial and floor value of ca->nv_min_cwnd */
#define NV_MIN_CWND_GROW  2		/* step used when ca->nv_min_cwnd is raised */
#define NV_TSO_CWND_BOUND 80		/* ca->nv_min_cwnd is capped at this + 1 */
122
123	static inline void tcpnv_reset(struct tcpnv ca, struct* sock *sk)
124	{
125	struct tcp_sock *tp = tcp_sk(sk);
126
127	ca->nv_reset = `0`;
128	ca->nv_no_cong_cnt = `0`;
129	ca->nv_rtt_cnt = `0`;
130	ca->nv_last_rtt = `0`;
131	ca->nv_rtt_max_rate = `0`;
132	ca->nv_rtt_start_seq = tp->snd_una;
133	ca->nv_eval_call_cnt = `0`;
134	ca->nv_last_snd_una = tp->snd_una;
135	}
136
137	static void tcpnv_init(struct sock *sk)
138	{
139	struct tcpnv *ca = inet_csk_ca(sk);
140	int base_rtt;
141
142	tcpnv_reset(ca, sk);
143
144	/ See if base_rtt is available from socket_ops bpf program.*
145	* It is meant to be used in environments, such as communication
146	* within a datacenter, where we have reasonable estimates of
147	* RTTs
148	*/
149	base_rtt = tcp_call_bpf(sk, op: BPF_SOCK_OPS_BASE_RTT, nargs: `0`, NULL);
150	if (base_rtt > `0`) {
151	ca->nv_base_rtt = base_rtt;
152	ca->nv_lower_bound_rtt = (base_rtt * `205`) >> `8`; / 80% /
153	} else {
154	ca->nv_base_rtt = `0`;
155	ca->nv_lower_bound_rtt = `0`;
156	}
157
158	ca->nv_allow_cwnd_growth = `1`;
159	ca->nv_min_rtt_reset_jiffies = jiffies + `2` * HZ;
160	ca->nv_min_rtt = NV_INIT_RTT;
161	ca->nv_min_rtt_new = NV_INIT_RTT;
162	ca->nv_min_cwnd = NV_MIN_CWND;
163	ca->nv_catchup = `0`;
164	ca->cwnd_growth_factor = `0`;
165	}
166
167	/ If provided, apply upper (base_rtt) and lower (lower_bound_rtt)*
168	* bounds to RTT.
169	*/
170	inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val)
171	{
172	if (ca->nv_lower_bound_rtt > `0` && val < ca->nv_lower_bound_rtt)
173	return ca->nv_lower_bound_rtt;
174	else if (ca->nv_base_rtt > `0` && val > ca->nv_base_rtt)
175	return ca->nv_base_rtt;
176	else
177	return val;
178	}
179
180	static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
181	{
182	struct tcp_sock *tp = tcp_sk(sk);
183	struct tcpnv *ca = inet_csk_ca(sk);
184	u32 cnt;
185
186	if (!tcp_is_cwnd_limited(sk))
187	return;
188
189	/ Only grow cwnd if NV has not detected congestion /
190	if (!ca->nv_allow_cwnd_growth)
191	return;
192
193	if (tcp_in_slow_start(tp)) {
194	acked = tcp_slow_start(tp, acked);
195	if (!acked)
196	return;
197	}
198
199	if (ca->cwnd_growth_factor < `0`) {
200	cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor;
201	tcp_cong_avoid_ai(tp, w: cnt, acked);
202	} else {
203	cnt = max(`4U`, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor);
204	tcp_cong_avoid_ai(tp, w: cnt, acked);
205	}
206	}
207
208	static u32 tcpnv_recalc_ssthresh(struct sock *sk)
209	{
210	const struct tcp_sock *tp = tcp_sk(sk);
211
212	return max((tcp_snd_cwnd(tp) * nv_loss_dec_factor) >> `10`, `2U`);
213	}
214
215	static void tcpnv_state(struct sock *sk, u8 new_state)
216	{
217	struct tcpnv *ca = inet_csk_ca(sk);
218
219	if (new_state == TCP_CA_Open && ca->nv_reset) {
220	tcpnv_reset(ca, sk);
221	} else if (new_state == TCP_CA_Loss \|\| new_state == TCP_CA_CWR \|\|
222	new_state == TCP_CA_Recovery) {
223	ca->nv_reset = `1`;
224	ca->nv_allow_cwnd_growth = `0`;
225	if (new_state == TCP_CA_Loss) {
226	/ Reset cwnd growth factor to Reno value /
227	if (ca->cwnd_growth_factor > `0`)
228	ca->cwnd_growth_factor = `0`;
229	/ Decrease growth rate if allowed /
230	if (nv_cwnd_growth_rate_neg > `0` &&
231	ca->cwnd_growth_factor > -`8`)
232	ca->cwnd_growth_factor--;
233	}
234	}
235	}
236
237	/ Do congestion avoidance calculations for TCP-NV*
238	*/
239	static void tcpnv_acked(struct sock sk, const* struct ack_sample *sample)
240	{
241	const struct inet_connection_sock *icsk = inet_csk(sk);
242	struct tcp_sock *tp = tcp_sk(sk);
243	struct tcpnv *ca = inet_csk_ca(sk);
244	unsigned long now = jiffies;
245	u64 rate64;
246	u32 rate, max_win, cwnd_by_slope;
247	u32 avg_rtt;
248	u32 bytes_acked = `0`;
249
250	/ Some calls are for duplicates without timetamps /
251	if (sample->rtt_us < `0`)
252	return;
253
254	/ If not in TCP_CA_Open or TCP_CA_Disorder states, skip. /
255	if (icsk->icsk_ca_state != TCP_CA_Open &&
256	icsk->icsk_ca_state != TCP_CA_Disorder)
257	return;
258
259	/ Stop cwnd growth if we were in catch up mode /
260	if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) {
261	ca->nv_catchup = `0`;
262	ca->nv_allow_cwnd_growth = `0`;
263	}
264
265	bytes_acked = tp->snd_una - ca->nv_last_snd_una;
266	ca->nv_last_snd_una = tp->snd_una;
267
268	if (sample->in_flight == `0`)
269	return;
270
271	/ Calculate moving average of RTT /
272	if (nv_rtt_factor > `0`) {
273	if (ca->nv_last_rtt > `0`) {
274	avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor +
275	((u64)ca->nv_last_rtt)
276	* (`256` - nv_rtt_factor)) >> `8`;
277	} else {
278	avg_rtt = sample->rtt_us;
279	ca->nv_min_rtt = avg_rtt << `1`;
280	}
281	ca->nv_last_rtt = avg_rtt;
282	} else {
283	avg_rtt = sample->rtt_us;
284	}
285
286	/ rate in 100's bits per second /
287	rate64 = ((u64)sample->in_flight) * `80000`;
288	do_div(rate64, avg_rtt ?: `1`);
289	rate = (u32)rate64;
290
291	/ Remember the maximum rate seen during this RTT*
292	* Note: It may be more than one RTT. This function should be
293	* called at least nv_dec_eval_min_calls times.
294	*/
295	if (ca->nv_rtt_max_rate < rate)
296	ca->nv_rtt_max_rate = rate;
297
298	/ We have valid information, increment counter /
299	if (ca->nv_eval_call_cnt < `255`)
300	ca->nv_eval_call_cnt++;
301
302	/ Apply bounds to rtt. Only used to update min_rtt /
303	avg_rtt = nv_get_bounded_rtt(ca, val: avg_rtt);
304
305	/ update min rtt if necessary /
306	if (avg_rtt < ca->nv_min_rtt)
307	ca->nv_min_rtt = avg_rtt;
308
309	/ update future min_rtt if necessary /
310	if (avg_rtt < ca->nv_min_rtt_new)
311	ca->nv_min_rtt_new = avg_rtt;
312
313	/ nv_min_rtt is updated with the minimum (possibley averaged) rtt*
314	* seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a
315	* warm reset). This new nv_min_rtt will be continued to be updated
316	* and be used for another sysctl_tcp_nv_reset_period seconds,
317	* when it will be updated again.
318	* In practice we introduce some randomness, so the actual period used
319	* is chosen randomly from the range:
320	* [sysctl_tcp_nv_reset_period3/4, sysctl_tcp_nv_reset_period5/4)
321	*/
322	if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
323	unsigned char rand;
324
325	ca->nv_min_rtt = ca->nv_min_rtt_new;
326	ca->nv_min_rtt_new = NV_INIT_RTT;
327	get_random_bytes(buf: &rand, len: `1`);
328	ca->nv_min_rtt_reset_jiffies =
329	now + ((nv_reset_period * (`384` + rand) * HZ) >> `9`);
330	/ Every so often we decrease ca->nv_min_cwnd in case previous*
331	* value is no longer accurate.
332	*/
333	ca->nv_min_cwnd = max(ca->nv_min_cwnd / `2`, NV_MIN_CWND);
334	}
335
336	/ Once per RTT check if we need to do congestion avoidance /
337	if (before(seq1: ca->nv_rtt_start_seq, seq2: tp->snd_una)) {
338	ca->nv_rtt_start_seq = tp->snd_nxt;
339	if (ca->nv_rtt_cnt < `0xff`)
340	/ Increase counter for RTTs without CA decision /
341	ca->nv_rtt_cnt++;
342
343	/ If this function is only called once within an RTT*
344	* the cwnd is probably too small (in some cases due to
345	* tso, lro or interrupt coalescence), so we increase
346	* ca->nv_min_cwnd.
347	*/
348	if (ca->nv_eval_call_cnt == `1` &&
349	bytes_acked >= (ca->nv_min_cwnd - `1`) * tp->mss_cache &&
350	ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + `1`)) {
351	ca->nv_min_cwnd = min(ca->nv_min_cwnd
352	+ NV_MIN_CWND_GROW,
353	NV_TSO_CWND_BOUND + `1`);
354	ca->nv_rtt_start_seq = tp->snd_nxt +
355	ca->nv_min_cwnd * tp->mss_cache;
356	ca->nv_eval_call_cnt = `0`;
357	ca->nv_allow_cwnd_growth = `1`;
358	return;
359	}
360
361	/ Find the ideal cwnd for current rate from slope*
362	* slope = 80000.0 * mss / nv_min_rtt
363	* cwnd_by_slope = nv_rtt_max_rate / slope
364	*/
365	cwnd_by_slope = (u32)
366	div64_u64(dividend: ((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
367	divisor: `80000ULL` * tp->mss_cache);
368	max_win = cwnd_by_slope + nv_pad;
369
370	/ If cwnd > max_win, decrease cwnd*
371	* if cwnd < max_win, grow cwnd
372	* else leave the same
373	*/
374	if (tcp_snd_cwnd(tp) > max_win) {
375	/ there is congestion, check that it is ok*
376	* to make a CA decision
377	* 1. We should have at least nv_dec_eval_min_calls
378	* data points before making a CA decision
379	* 2. We only make a congesion decision after
380	* nv_rtt_min_cnt RTTs
381	*/
382	if (ca->nv_rtt_cnt < nv_rtt_min_cnt) {
383	return;
384	} else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
385	if (ca->nv_eval_call_cnt <
386	nv_ssthresh_eval_min_calls)
387	return;
388	/ otherwise we will decrease cwnd /
389	} else if (ca->nv_eval_call_cnt <
390	nv_dec_eval_min_calls) {
391	if (ca->nv_allow_cwnd_growth &&
392	ca->nv_rtt_cnt > nv_stop_rtt_cnt)
393	ca->nv_allow_cwnd_growth = `0`;
394	return;
395	}
396
397	/ We have enough data to determine we are congested /
398	ca->nv_allow_cwnd_growth = `0`;
399	tp->snd_ssthresh =
400	(nv_ssthresh_factor * max_win) >> `3`;
401	if (tcp_snd_cwnd(tp) - max_win > `2`) {
402	/ gap > 2, we do exponential cwnd decrease /
403	int dec;
404
405	dec = max(`2U`, ((tcp_snd_cwnd(tp) - max_win) *
406	nv_cong_dec_mult) >> `7`);
407	tcp_snd_cwnd_set(tp, val: tcp_snd_cwnd(tp) - dec);
408	} else if (nv_cong_dec_mult > `0`) {
409	tcp_snd_cwnd_set(tp, val: max_win);
410	}
411	if (ca->cwnd_growth_factor > `0`)
412	ca->cwnd_growth_factor = `0`;
413	ca->nv_no_cong_cnt = `0`;
414	} else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) {
415	/ There is no congestion, grow cwnd if allowed/
416	if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls)
417	return;
418
419	ca->nv_allow_cwnd_growth = `1`;
420	ca->nv_no_cong_cnt++;
421	if (ca->cwnd_growth_factor < `0` &&
422	nv_cwnd_growth_rate_neg > `0` &&
423	ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) {
424	ca->cwnd_growth_factor++;
425	ca->nv_no_cong_cnt = `0`;
426	} else if (ca->cwnd_growth_factor >= `0` &&
427	nv_cwnd_growth_rate_pos > `0` &&
428	ca->nv_no_cong_cnt >
429	nv_cwnd_growth_rate_pos) {
430	ca->cwnd_growth_factor++;
431	ca->nv_no_cong_cnt = `0`;
432	}
433	} else {
434	/ cwnd is in-between, so do nothing /
435	return;
436	}
437
438	/ update state /
439	ca->nv_eval_call_cnt = `0`;
440	ca->nv_rtt_cnt = `0`;
441	ca->nv_rtt_max_rate = `0`;
442
443	/ Don't want to make cwnd < nv_min_cwnd*
444	* (it wasn't before, if it is now is because nv
445	* decreased it).
446	*/
447	if (tcp_snd_cwnd(tp) < nv_min_cwnd)
448	tcp_snd_cwnd_set(tp, val: nv_min_cwnd);
449	}
450	}
451
452	/ Extract info for Tcp socket info provided via netlink /
453	static size_t tcpnv_get_info(struct sock sk, u32 ext, int* *attr,
454	union tcp_cc_info *info)
455	{
456	const struct tcpnv *ca = inet_csk_ca(sk);
457
458	if (ext & (`1` << (INET_DIAG_VEGASINFO - `1`))) {
459	info->vegas.tcpv_enabled = `1`;
460	info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
461	info->vegas.tcpv_rtt = ca->nv_last_rtt;
462	info->vegas.tcpv_minrtt = ca->nv_min_rtt;
463
464	*attr = INET_DIAG_VEGASINFO;
465	return sizeof(struct tcpvegas_info);
466	}
467	return `0`;
468	}
469
470	static struct tcp_congestion_ops tcpnv __read_mostly = {
471	.init = tcpnv_init,
472	.ssthresh = tcpnv_recalc_ssthresh,
473	.cong_avoid = tcpnv_cong_avoid,
474	.set_state = tcpnv_state,
475	.undo_cwnd = tcp_reno_undo_cwnd,
476	.pkts_acked = tcpnv_acked,
477	.get_info = tcpnv_get_info,
478
479	.owner = THIS_MODULE,
480	.name = "nv",
481	};
482
483	static int __init tcpnv_register(void)
484	{
485	BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
486
487	return tcp_register_congestion_control(type: &tcpnv);
488	}
489
490	static void __exit tcpnv_unregister(void)
491	{
492	tcp_unregister_congestion_control(type: &tcpnv);
493	}
494
495	module_init(tcpnv_register);
496	module_exit(tcpnv_unregister);
497
498	MODULE_AUTHOR("Lawrence Brakmo");
499	MODULE_LICENSE("GPL");
500	MODULE_DESCRIPTION("TCP NV");
501	MODULE_VERSION("1.0");
502

source code of linux/net/ipv4/tcp_nv.c