tcp_cong.c source code [linux/net/ipv4/tcp_cong.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Pluggable TCP congestion control support and newReno
4	* congestion control.
5	* Based on ideas from I/O scheduler support and Web100.
6	*
7	* Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
8	*/
9
10	#define pr_fmt(fmt) "TCP: " fmt
11
12	#include <linux/module.h>
13	#include <linux/mm.h>
14	#include <linux/types.h>
15	#include <linux/list.h>
16	#include <linux/gfp.h>
17	#include <linux/jhash.h>
18	#include <net/tcp.h>
19	#include <trace/events/tcp.h>
20
/* List of registered congestion control algorithms. Lookups in this file
 * walk it with list_for_each_entry_rcu(); the registration, unregistration
 * and update paths modify it while holding tcp_cong_list_lock.
 */
static LIST_HEAD(tcp_cong_list);
static DEFINE_SPINLOCK(tcp_cong_list_lock);
23
24	/ Simple linear search, don't expect many entries! /
25	struct tcp_congestion_ops tcp_ca_find(const* char *name)
26	{
27	struct tcp_congestion_ops *e;
28
29	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
30	if (strcmp(e->name, name) == `0`)
31	return e;
32	}
33
34	return NULL;
35	}
36
37	void tcp_set_ca_state(struct sock sk, const* u8 ca_state)
38	{
39	struct inet_connection_sock *icsk = inet_csk(sk);
40
41	trace_tcp_cong_state_set(sk, ca_state);
42
43	if (icsk->icsk_ca_ops->set_state)
44	icsk->icsk_ca_ops->set_state(sk, ca_state);
45	icsk->icsk_ca_state = ca_state;
46	}
47
/* Look up a congestion control algorithm by name, requesting the
 * "tcp_<name>" module if it is not registered yet.
 *
 * Must be called with rcu lock held. The lock is dropped around
 * request_module() and re-acquired before the second lookup, so the
 * caller still holds it on return. Module loading is only attempted
 * when CONFIG_MODULES is enabled and the caller has CAP_NET_ADMIN.
 *
 * Returns the matching ops or NULL. @net is not used by this function.
 */
static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
						       const char *name)
{
	struct tcp_congestion_ops *ca = tcp_ca_find(name);

#ifdef CONFIG_MODULES
	if (!ca && capable(CAP_NET_ADMIN)) {
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		ca = tcp_ca_find(name);
	}
#endif
	return ca;
}
64
65	/ Simple linear search, not much in here. /
66	struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
67	{
68	struct tcp_congestion_ops *e;
69
70	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
71	if (e->key == key)
72	return e;
73	}
74
75	return NULL;
76	}
77
78	int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
79	{
80	/ all algorithms must implement these /
81	if (!ca->ssthresh \|\| !ca->undo_cwnd \|\|
82	!(ca->cong_avoid \|\| ca->cong_control)) {
83	pr_err("%s does not implement required ops\n", ca->name);
84	return -EINVAL;
85	}
86
87	return `0`;
88	}
89
90	/ Attach new congestion control algorithm to the list*
91	* of available options.
92	*/
93	int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
94	{
95	int ret;
96
97	ret = tcp_validate_congestion_control(ca);
98	if (ret)
99	return ret;
100
101	ca->key = jhash(key: ca->name, length: sizeof(ca->name), strlen(ca->name));
102
103	spin_lock(lock: &tcp_cong_list_lock);
104	if (ca->key == TCP_CA_UNSPEC \|\| tcp_ca_find_key(key: ca->key)) {
105	pr_notice("%s already registered or non-unique key\n",
106	ca->name);
107	ret = -EEXIST;
108	} else {
109	list_add_tail_rcu(new: &ca->list, head: &tcp_cong_list);
110	pr_debug("%s registered\n", ca->name);
111	}
112	spin_unlock(lock: &tcp_cong_list_lock);
113
114	return ret;
115	}
116	EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
117
118	/*
119	* Remove congestion control algorithm, called from
120	* the module's remove function. Module ref counts are used
121	* to ensure that this can't be done till all sockets using
122	* that method are closed.
123	*/
124	void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
125	{
126	spin_lock(lock: &tcp_cong_list_lock);
127	list_del_rcu(entry: &ca->list);
128	spin_unlock(lock: &tcp_cong_list_lock);
129
130	/ Wait for outstanding readers to complete before the*
131	* module gets removed entirely.
132	*
133	* A try_module_get() should fail by now as our module is
134	* in "going" state since no refs are held anymore and
135	* module_exit() handler being called.
136	*/
137	synchronize_rcu();
138	}
139	EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
140
141	/ Replace a registered old ca with a new one.*
142	*
143	* The new ca must have the same name as the old one, that has been
144	* registered.
145	*/
146	int tcp_update_congestion_control(struct tcp_congestion_ops ca, struct* tcp_congestion_ops *old_ca)
147	{
148	struct tcp_congestion_ops *existing;
149	int ret;
150
151	ret = tcp_validate_congestion_control(ca);
152	if (ret)
153	return ret;
154
155	ca->key = jhash(key: ca->name, length: sizeof(ca->name), strlen(ca->name));
156
157	spin_lock(lock: &tcp_cong_list_lock);
158	existing = tcp_ca_find_key(key: old_ca->key);
159	if (ca->key == TCP_CA_UNSPEC \|\| !existing \|\| strcmp(existing->name, ca->name)) {
160	pr_notice("%s not registered or non-unique key\n",
161	ca->name);
162	ret = -EINVAL;
163	} else if (existing != old_ca) {
164	pr_notice("invalid old congestion control algorithm to replace\n");
165	ret = -EINVAL;
166	} else {
167	/ Add the new one before removing the old one to keep*
168	* one implementation available all the time.
169	*/
170	list_add_tail_rcu(new: &ca->list, head: &tcp_cong_list);
171	list_del_rcu(entry: &existing->list);
172	pr_debug("%s updated\n", ca->name);
173	}
174	spin_unlock(lock: &tcp_cong_list_lock);
175
176	/ Wait for outstanding readers to complete before the*
177	* module or struct_ops gets removed entirely.
178	*/
179	if (!ret)
180	synchronize_rcu();
181
182	return ret;
183	}
184
185	u32 tcp_ca_get_key_by_name(struct net net, const* char name, bool ecn_ca)
186	{
187	const struct tcp_congestion_ops *ca;
188	u32 key = TCP_CA_UNSPEC;
189
190	might_sleep();
191
192	rcu_read_lock();
193	ca = tcp_ca_find_autoload(net, name);
194	if (ca) {
195	key = ca->key;
196	*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
197	}
198	rcu_read_unlock();
199
200	return key;
201	}
202
203	char tcp_ca_get_name_by_key(u32 key, char* *buffer)
204	{
205	const struct tcp_congestion_ops *ca;
206	char *ret = NULL;
207
208	rcu_read_lock();
209	ca = tcp_ca_find_key(key);
210	if (ca)
211	ret = strncpy(p: buffer, q: ca->name,
212	TCP_CA_NAME_MAX);
213	rcu_read_unlock();
214
215	return ret;
216	}
217
218	/ Assign choice of congestion control. /
219	void tcp_assign_congestion_control(struct sock *sk)
220	{
221	struct net *net = sock_net(sk);
222	struct inet_connection_sock *icsk = inet_csk(sk);
223	const struct tcp_congestion_ops *ca;
224
225	rcu_read_lock();
226	ca = rcu_dereference(net->ipv4.tcp_congestion_control);
227	if (unlikely(!bpf_try_module_get(ca, ca->owner)))
228	ca = &tcp_reno;
229	icsk->icsk_ca_ops = ca;
230	rcu_read_unlock();
231
232	memset(icsk->icsk_ca_priv, `0`, sizeof(icsk->icsk_ca_priv));
233	if (ca->flags & TCP_CONG_NEEDS_ECN)
234	INET_ECN_xmit(sk);
235	else
236	INET_ECN_dontxmit(sk);
237	}
238
239	void tcp_init_congestion_control(struct sock *sk)
240	{
241	struct inet_connection_sock *icsk = inet_csk(sk);
242
243	tcp_sk(sk)->prior_ssthresh = `0`;
244	if (icsk->icsk_ca_ops->init)
245	icsk->icsk_ca_ops->init(sk);
246	if (tcp_ca_needs_ecn(sk))
247	INET_ECN_xmit(sk);
248	else
249	INET_ECN_dontxmit(sk);
250	icsk->icsk_ca_initialized = `1`;
251	}
252
253	static void tcp_reinit_congestion_control(struct sock *sk,
254	const struct tcp_congestion_ops *ca)
255	{
256	struct inet_connection_sock *icsk = inet_csk(sk);
257
258	tcp_cleanup_congestion_control(sk);
259	icsk->icsk_ca_ops = ca;
260	icsk->icsk_ca_setsockopt = `1`;
261	memset(icsk->icsk_ca_priv, `0`, sizeof(icsk->icsk_ca_priv));
262
263	if (ca->flags & TCP_CONG_NEEDS_ECN)
264	INET_ECN_xmit(sk);
265	else
266	INET_ECN_dontxmit(sk);
267
268	if (!((`1` << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
269	tcp_init_congestion_control(sk);
270	}
271
272	/ Manage refcounts on socket close. /
273	void tcp_cleanup_congestion_control(struct sock *sk)
274	{
275	struct inet_connection_sock *icsk = inet_csk(sk);
276
277	if (icsk->icsk_ca_ops->release)
278	icsk->icsk_ca_ops->release(sk);
279	bpf_module_put(data: icsk->icsk_ca_ops, owner: icsk->icsk_ca_ops->owner);
280	}
281
282	/ Used by sysctl to change default congestion control /
283	int tcp_set_default_congestion_control(struct net net, const* char *name)
284	{
285	struct tcp_congestion_ops *ca;
286	const struct tcp_congestion_ops *prev;
287	int ret;
288
289	rcu_read_lock();
290	ca = tcp_ca_find_autoload(net, name);
291	if (!ca) {
292	ret = -ENOENT;
293	} else if (!bpf_try_module_get(data: ca, owner: ca->owner)) {
294	ret = -EBUSY;
295	} else if (!net_eq(net1: net, net2: &init_net) &&
296	!(ca->flags & TCP_CONG_NON_RESTRICTED)) {
297	/ Only init netns can set default to a restricted algorithm /
298	ret = -EPERM;
299	} else {
300	prev = xchg(&net->ipv4.tcp_congestion_control, ca);
301	if (prev)
302	bpf_module_put(data: prev, owner: prev->owner);
303
304	ca->flags \|= TCP_CONG_NON_RESTRICTED;
305	ret = `0`;
306	}
307	rcu_read_unlock();
308
309	return ret;
310	}
311
312	/ Set default value from kernel configuration at bootup /
313	static int __init tcp_congestion_default(void)
314	{
315	return tcp_set_default_congestion_control(net: &init_net,
316	CONFIG_DEFAULT_TCP_CONG);
317	}
318	late_initcall(tcp_congestion_default);
319
320	/ Build string with list of available congestion control values /
321	void tcp_get_available_congestion_control(char *buf, size_t maxlen)
322	{
323	struct tcp_congestion_ops *ca;
324	size_t offs = `0`;
325
326	rcu_read_lock();
327	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
328	offs += snprintf(buf: buf + offs, size: maxlen - offs,
329	fmt: "%s%s",
330	offs == `0` ? "" : " ", ca->name);
331
332	if (WARN_ON_ONCE(offs >= maxlen))
333	break;
334	}
335	rcu_read_unlock();
336	}
337
338	/ Get current default congestion control /
339	void tcp_get_default_congestion_control(struct net net, char* *name)
340	{
341	const struct tcp_congestion_ops *ca;
342
343	rcu_read_lock();
344	ca = rcu_dereference(net->ipv4.tcp_congestion_control);
345	strncpy(p: name, q: ca->name, TCP_CA_NAME_MAX);
346	rcu_read_unlock();
347	}
348
349	/ Built list of non-restricted congestion control values /
350	void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
351	{
352	struct tcp_congestion_ops *ca;
353	size_t offs = `0`;
354
355	*buf = `'\0'`;
356	rcu_read_lock();
357	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
358	if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
359	continue;
360	offs += snprintf(buf: buf + offs, size: maxlen - offs,
361	fmt: "%s%s",
362	offs == `0` ? "" : " ", ca->name);
363
364	if (WARN_ON_ONCE(offs >= maxlen))
365	break;
366	}
367	rcu_read_unlock();
368	}
369
370	/ Change list of non-restricted congestion control /
371	int tcp_set_allowed_congestion_control(char *val)
372	{
373	struct tcp_congestion_ops *ca;
374	char saved_clone, clone, *name;
375	int ret = `0`;
376
377	saved_clone = clone = kstrdup(s: val, GFP_USER);
378	if (!clone)
379	return -ENOMEM;
380
381	spin_lock(lock: &tcp_cong_list_lock);
382	/ pass 1 check for bad entries /
383	while ((name = strsep(&clone, " ")) && *name) {
384	ca = tcp_ca_find(name);
385	if (!ca) {
386	ret = -ENOENT;
387	goto out;
388	}
389	}
390
391	/ pass 2 clear old values /
392	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
393	ca->flags &= ~TCP_CONG_NON_RESTRICTED;
394
395	/ pass 3 mark as allowed /
396	while ((name = strsep(&val, " ")) && *name) {
397	ca = tcp_ca_find(name);
398	WARN_ON(!ca);
399	if (ca)
400	ca->flags \|= TCP_CONG_NON_RESTRICTED;
401	}
402	out:
403	spin_unlock(lock: &tcp_cong_list_lock);
404	kfree(objp: saved_clone);
405
406	return ret;
407	}
408
409	/ Change congestion control for socket. If load is false, then it is the*
410	* responsibility of the caller to call tcp_init_congestion_control or
411	* tcp_reinit_congestion_control (if the current congestion control was
412	* already initialized.
413	*/
414	int tcp_set_congestion_control(struct sock sk, const* char *name, bool load,
415	bool cap_net_admin)
416	{
417	struct inet_connection_sock *icsk = inet_csk(sk);
418	const struct tcp_congestion_ops *ca;
419	int err = `0`;
420
421	if (icsk->icsk_ca_dst_locked)
422	return -EPERM;
423
424	rcu_read_lock();
425	if (!load)
426	ca = tcp_ca_find(name);
427	else
428	ca = tcp_ca_find_autoload(net: sock_net(sk), name);
429
430	/ No change asking for existing value /
431	if (ca == icsk->icsk_ca_ops) {
432	icsk->icsk_ca_setsockopt = `1`;
433	goto out;
434	}
435
436	if (!ca)
437	err = -ENOENT;
438	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) \|\| cap_net_admin))
439	err = -EPERM;
440	else if (!bpf_try_module_get(data: ca, owner: ca->owner))
441	err = -EBUSY;
442	else
443	tcp_reinit_congestion_control(sk, ca);
444	out:
445	rcu_read_unlock();
446	return err;
447	}
448
449	/ Slow start is used when congestion window is no greater than the slow start*
450	* threshold. We base on RFC2581 and also handle stretch ACKs properly.
451	* We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
452	* something better;) a packet is only considered (s)acked in its entirety to
453	* defend the ACK attacks described in the RFC. Slow start processes a stretch
454	* ACK of degree N as if N acks of degree 1 are received back to back except
455	* ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
456	* returns the leftover acks to adjust cwnd in congestion avoidance mode.
457	*/
458	__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
459	{
460	u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);
461
462	acked -= cwnd - tcp_snd_cwnd(tp);
463	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));
464
465	return acked;
466	}
467	EXPORT_SYMBOL_GPL(tcp_slow_start);
468
469	/ In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),*
470	* for every packet that was ACKed.
471	*/
472	__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
473	{
474	/ If credits accumulated at a higher w, apply them gently now. /
475	if (tp->snd_cwnd_cnt >= w) {
476	tp->snd_cwnd_cnt = `0`;
477	tcp_snd_cwnd_set(tp, val: tcp_snd_cwnd(tp) + `1`);
478	}
479
480	tp->snd_cwnd_cnt += acked;
481	if (tp->snd_cwnd_cnt >= w) {
482	u32 delta = tp->snd_cwnd_cnt / w;
483
484	tp->snd_cwnd_cnt -= delta * w;
485	tcp_snd_cwnd_set(tp, val: tcp_snd_cwnd(tp) + delta);
486	}
487	tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
488	}
489	EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
490
491	/*
492	* TCP Reno congestion control
493	* This is special case used for fallback as well.
494	*/
495	/ This is Jacobson's slow start and congestion avoidance.*
496	* SIGCOMM '88, p. 328.
497	*/
498	__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
499	{
500	struct tcp_sock *tp = tcp_sk(sk);
501
502	if (!tcp_is_cwnd_limited(sk))
503	return;
504
505	/ In "safe" area, increase. /
506	if (tcp_in_slow_start(tp)) {
507	acked = tcp_slow_start(tp, acked);
508	if (!acked)
509	return;
510	}
511	/ In dangerous area, increase slowly. /
512	tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
513	}
514	EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
515
516	/ Slow start threshold is half the congestion window (min 2) /
517	__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
518	{
519	const struct tcp_sock *tp = tcp_sk(sk);
520
521	return max(tcp_snd_cwnd(tp) >> `1U`, `2U`);
522	}
523	EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
524
525	__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
526	{
527	const struct tcp_sock *tp = tcp_sk(sk);
528
529	return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
530	}
531	EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
532
533	struct tcp_congestion_ops tcp_reno = {
534	.flags = TCP_CONG_NON_RESTRICTED,
535	.name = "reno",
536	.owner = THIS_MODULE,
537	.ssthresh = tcp_reno_ssthresh,
538	.cong_avoid = tcp_reno_cong_avoid,
539	.undo_cwnd = tcp_reno_undo_cwnd,
540	};
541

source code of linux/net/ipv4/tcp_cong.c