tcp_bbr.c source code [linux/net/ipv4/tcp_bbr.c]

/* Bottleneck Bandwidth and RTT (BBR) congestion control
2	*
3	* BBR congestion control computes the sending rate based on the delivery
4	* rate (throughput) estimated from ACKs. In a nutshell:
5	*
6	* On each ACK, update our model of the network path:
7	* bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
8	* min_rtt = windowed_min(rtt, 10 seconds)
9	* pacing_rate = pacing_gain * bottleneck_bandwidth
10	* cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
11	*
12	* The core algorithm does not react directly to packet losses or delays,
13	* although BBR may adjust the size of next send per ACK when loss is
14	* observed, or adjust the sending rate if it estimates there is a
15	* traffic policer, in order to keep the drop rate reasonable.
16	*
 * Here is a state transition diagram for BBR:
 *
 *             |
 *             V
 *    +---> STARTUP  ----+
 *    |        |         |
 *    |        V         |
 *    |      DRAIN   ----+
 *    |        |         |
 *    |        V         |
 *    +---> PROBE_BW ----+
 *    |      ^    |      |
 *    |      |    |      |
 *    |      +----+      |
 *    |                  |
 *    +---- PROBE_RTT <--+
 *
34	* A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
35	* When it estimates the pipe is full, it enters DRAIN to drain the queue.
36	* In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
37	* A long-lived BBR flow spends the vast majority of its time remaining
38	* (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
39	* in a fair manner, with a small, bounded queue. If a flow has been
40	* continuously sending for the entire min_rtt window, and hasn't seen an RTT
41	* sample that matches or decreases its min_rtt estimate for 10 seconds, then
42	* it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
43	* the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
44	* we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
45	* otherwise we enter STARTUP to try to fill the pipe.
46	*
47	* BBR is described in detail in:
48	* "BBR: Congestion-Based Congestion Control",
49	* Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
50	* Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
51	*
52	* There is a public e-mail list for discussing BBR development and testing:
53	* https://groups.google.com/forum/#!forum/bbr-dev
54	*
55	* NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
56	* otherwise TCP stack falls back to an internal pacing using one high
57	* resolution timer per TCP socket and may use more resources.
58	*/
59	#include <linux/btf.h>
60	#include <linux/btf_ids.h>
61	#include <linux/module.h>
62	#include <net/tcp.h>
63	#include <linux/inet_diag.h>
64	#include <linux/inet.h>
65	#include <linux/random.h>
66	#include <linux/win_minmax.h>
67
/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
 * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
 * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
 * Since the minimum window is >=4 packets, the lower bound isn't
 * an issue. The upper bound isn't an issue with existing technologies.
 */
#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)

#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
#define BBR_UNIT (1 << BBR_SCALE)
79
/* BBR has the following modes for deciding how fast to send: */
enum bbr_mode {
	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
	BBR_DRAIN,	/* drain any queue created during startup */
	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
};
87
88	/ BBR congestion control block /
89	struct bbr {
90	u32 min_rtt_us; / min RTT in min_rtt_win_sec window /
91	u32 min_rtt_stamp; / timestamp of min_rtt_us /
92	u32 probe_rtt_done_stamp; / end time for BBR_PROBE_RTT mode /
93	struct minmax bw; / Max recent delivery rate in pkts/uS << 24 /
94	u32 rtt_cnt; / count of packet-timed rounds elapsed /
95	u32 next_rtt_delivered; / scb->tx.delivered at end of round /
96	u64 cycle_mstamp; / time of this cycle phase start /
97	u32 mode:`3`, / current bbr_mode in state machine /
98	prev_ca_state:`3`, / CA state on previous ACK /
99	packet_conservation:`1`, / use packet conservation? /
100	round_start:`1`, / start of packet-timed tx->ack round? /
101	idle_restart:`1`, / restarting after idle? /
102	probe_rtt_round_done:`1`, / a BBR_PROBE_RTT round at 4 pkts? /
103	unused:`13`,
104	lt_is_sampling:`1`, / taking long-term ("LT") samples now? /
105	lt_rtt_cnt:`7`, / round trips in long-term interval /
106	lt_use_bw:`1`; / use lt_bw as our bw estimate? /
107	u32 lt_bw; / LT est delivery rate in pkts/uS << 24 /
108	u32 lt_last_delivered; / LT intvl start: tp->delivered /
109	u32 lt_last_stamp; / LT intvl start: tp->delivered_mstamp /
110	u32 lt_last_lost; / LT intvl start: tp->lost /
111	u32 pacing_gain:`10`, / current gain for setting pacing rate /
112	cwnd_gain:`10`, / current gain for setting cwnd /
113	full_bw_reached:`1`, / reached full bw in Startup? /
114	full_bw_cnt:`2`, / number of rounds without large bw gains /
115	cycle_idx:`3`, / current index in pacing_gain cycle array /
116	has_seen_rtt:`1`, / have we seen an RTT sample yet? /
117	unused_b:`5`;
118	u32 prior_cwnd; / prior cwnd upon entering loss recovery /
119	u32 full_bw; / recent bw, to estimate if pipe is full /
120
121	/ For tracking ACK aggregation: /
122	u64 ack_epoch_mstamp; / start of ACK sampling epoch /
123	u16 extra_acked[`2`]; / max excess data ACKed in epoch /
124	u32 ack_epoch_acked:`20`, / packets (S)ACKed in sampling epoch /
125	extra_acked_win_rtts:`5`, / age of extra_acked, in round trips /
126	extra_acked_win_idx:`1`, / current index in extra_acked array /
127	unused_c:`6`;
128	};
129
130	#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
131
132	/ Window length of bw filter (in rounds): /
133	static const int bbr_bw_rtts = CYCLE_LEN + `2`;
134	/ Window length of min_rtt filter (in sec): /
135	static const u32 bbr_min_rtt_win_sec = `10`;
136	/ Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: /
137	static const u32 bbr_probe_rtt_mode_ms = `200`;
138	/ Skip TSO below the following bandwidth (bits/sec): /
139	static const int bbr_min_tso_rate = `1200000`;
140
141	/ Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.*
142	* In order to help drive the network toward lower queues and low latency while
143	* maintaining high utilization, the average pacing rate aims to be slightly
144	* lower than the estimated bandwidth. This is an important aspect of the
145	* design.
146	*/
147	static const int bbr_pacing_margin_percent = `1`;
148
149	/ We use a high_gain value of 2/ln(2) because it's the smallest pacing gain*
150	* that will allow a smoothly increasing pacing rate that will double each RTT
151	* and send the same number of packets per RTT that an un-paced, slow-starting
152	* Reno or CUBIC flow would:
153	*/
154	static const int bbr_high_gain = BBR_UNIT * `2885` / `1000` + `1`;
155	/ The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain*
156	* the queue created in BBR_STARTUP in a single round:
157	*/
158	static const int bbr_drain_gain = BBR_UNIT * `1000` / `2885`;
159	/ The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: /
160	static const int bbr_cwnd_gain = BBR_UNIT * `2`;
161	/ The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: /
162	static const int bbr_pacing_gain[] = {
163	BBR_UNIT * `5` / `4`, / probe for more available bw /
164	BBR_UNIT * `3` / `4`, / drain queue and/or yield bw to other flows /
165	BBR_UNIT, BBR_UNIT, BBR_UNIT, / cruise at 1.0bw to utilize pipe, /*
166	BBR_UNIT, BBR_UNIT, BBR_UNIT / without creating excess queue... /
167	};
168	/ Randomize the starting gain cycling phase over N phases: /
169	static const u32 bbr_cycle_rand = `7`;
170
171	/ Try to keep at least this many packets in flight, if things go smoothly. For*
172	* smooth functioning, a sliding window protocol ACKing every other packet
173	* needs at least 4 packets in flight:
174	*/
175	static const u32 bbr_cwnd_min_target = `4`;
176
177	/ To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... /
178	/ If bw has increased significantly (1.25x), there may be more bw available: /
179	static const u32 bbr_full_bw_thresh = BBR_UNIT * `5` / `4`;
180	/ But after 3 rounds w/o significant bw growth, estimate pipe is full: /
181	static const u32 bbr_full_bw_cnt = `3`;
182
183	/ "long-term" ("LT") bandwidth estimator parameters... /
184	/ The minimum number of rounds in an LT bw sampling interval: /
185	static const u32 bbr_lt_intvl_min_rtts = `4`;
186	/ If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: /
187	static const u32 bbr_lt_loss_thresh = `50`;
188	/ If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": /
189	static const u32 bbr_lt_bw_ratio = BBR_UNIT / `8`;
190	/ If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": /
191	static const u32 bbr_lt_bw_diff = `4000` / `8`;
192	/ If we estimate we're policed, use lt_bw for this many round trips: /
193	static const u32 bbr_lt_bw_max_rtts = `48`;
194
195	/ Gain factor for adding extra_acked to target cwnd: /
196	static const int bbr_extra_acked_gain = BBR_UNIT;
197	/ Window length of extra_acked window. /
198	static const u32 bbr_extra_acked_win_rtts = `5`;
199	/ Max allowed val for ack_epoch_acked, after which sampling epoch is reset /
200	static const u32 bbr_ack_epoch_acked_reset_thresh = `1U` << `20`;
201	/ Time period for clamping cwnd increment due to ack aggregation /
202	static const u32 bbr_extra_acked_max_us = `100` * `1000`;
203
204	static void bbr_check_probe_rtt_done(struct sock *sk);
205
206	/ Do we estimate that STARTUP filled the pipe? /
207	static bool bbr_full_bw_reached(const struct sock *sk)
208	{
209	const struct bbr *bbr = inet_csk_ca(sk);
210
211	return bbr->full_bw_reached;
212	}
213
214	/ Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. /
215	static u32 bbr_max_bw(const struct sock *sk)
216	{
217	struct bbr *bbr = inet_csk_ca(sk);
218
219	return minmax_get(m: &bbr->bw);
220	}
221
222	/ Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. /
223	static u32 bbr_bw(const struct sock *sk)
224	{
225	struct bbr *bbr = inet_csk_ca(sk);
226
227	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
228	}
229
230	/ Return maximum extra acked in past k-2k round trips,*
231	* where k = bbr_extra_acked_win_rtts.
232	*/
233	static u16 bbr_extra_acked(const struct sock *sk)
234	{
235	struct bbr *bbr = inet_csk_ca(sk);
236
237	return max(bbr->extra_acked[`0`], bbr->extra_acked[`1`]);
238	}
239
240	/ Return rate in bytes per second, optionally with a gain.*
241	* The order here is chosen carefully to avoid overflow of u64. This should
242	* work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
243	*/
244	static u64 bbr_rate_bytes_per_sec(struct sock sk, u64 rate, int* gain)
245	{
246	unsigned int mss = tcp_sk(sk)->mss_cache;
247
248	rate *= mss;
249	rate *= gain;
250	rate >>= BBR_SCALE;
251	rate = USEC_PER_SEC / `100` (`100` - bbr_pacing_margin_percent);
252	return rate >> BW_SCALE;
253	}
254
255	/ Convert a BBR bw and gain factor to a pacing rate in bytes per second. /
256	static unsigned long bbr_bw_to_pacing_rate(struct sock sk, u32 bw, int* gain)
257	{
258	u64 rate = bw;
259
260	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
261	rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
262	return rate;
263	}
264
265	/ Initialize pacing rate to: high_gain * init_cwnd / RTT. /
266	static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
267	{
268	struct tcp_sock *tp = tcp_sk(sk);
269	struct bbr *bbr = inet_csk_ca(sk);
270	u64 bw;
271	u32 rtt_us;
272
273	if (tp->srtt_us) { / any RTT sample yet? /
274	rtt_us = max(tp->srtt_us >> `3`, `1U`);
275	bbr->has_seen_rtt = `1`;
276	} else { / no RTT sample yet /
277	rtt_us = USEC_PER_MSEC; / use nominal default RTT /
278	}
279	bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
280	do_div(bw, rtt_us);
281	WRITE_ONCE(sk->sk_pacing_rate,
282	bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
283	}
284
285	/ Pace using current bw estimate and a gain factor. /
286	static void bbr_set_pacing_rate(struct sock sk, u32 bw, int* gain)
287	{
288	struct tcp_sock *tp = tcp_sk(sk);
289	struct bbr *bbr = inet_csk_ca(sk);
290	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
291
292	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
293	bbr_init_pacing_rate_from_rtt(sk);
294	if (bbr_full_bw_reached(sk) \|\| rate > READ_ONCE(sk->sk_pacing_rate))
295	WRITE_ONCE(sk->sk_pacing_rate, rate);
296	}
297
298	/ override sysctl_tcp_min_tso_segs /
299	__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
300	{
301	return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> `3`) ? `1` : `2`;
302	}
303
304	static u32 bbr_tso_segs_goal(struct sock *sk)
305	{
306	struct tcp_sock *tp = tcp_sk(sk);
307	u32 segs, bytes;
308
309	/ Sort of tcp_tso_autosize() but ignoring*
310	* driver provided sk_gso_max_size.
311	*/
312	bytes = min_t(unsigned long,
313	READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
314	GSO_LEGACY_MAX_SIZE - `1` - MAX_TCP_HEADER);
315	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
316
317	return min(segs, `0x7FU`);
318	}
319
320	/ Save "last known good" cwnd so we can restore it after losses or PROBE_RTT /
321	static void bbr_save_cwnd(struct sock *sk)
322	{
323	struct tcp_sock *tp = tcp_sk(sk);
324	struct bbr *bbr = inet_csk_ca(sk);
325
326	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
327	bbr->prior_cwnd = tcp_snd_cwnd(tp); / this cwnd is good enough /
328	else / loss recovery or BBR_PROBE_RTT have temporarily cut cwnd /
329	bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
330	}
331
332	__bpf_kfunc static void bbr_cwnd_event(struct sock sk, enum* tcp_ca_event event)
333	{
334	struct tcp_sock *tp = tcp_sk(sk);
335	struct bbr *bbr = inet_csk_ca(sk);
336
337	if (event == CA_EVENT_TX_START && tp->app_limited) {
338	bbr->idle_restart = `1`;
339	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
340	bbr->ack_epoch_acked = `0`;
341	/ Avoid pointless buffer overflows: pace at est. bw if we don't*
342	* need more speed (we're restarting from idle and app-limited).
343	*/
344	if (bbr->mode == BBR_PROBE_BW)
345	bbr_set_pacing_rate(sk, bw: bbr_bw(sk), BBR_UNIT);
346	else if (bbr->mode == BBR_PROBE_RTT)
347	bbr_check_probe_rtt_done(sk);
348	}
349	}
350
351	/ Calculate bdp based on min RTT and the estimated bottleneck bandwidth:*
352	*
353	* bdp = ceil(bw * min_rtt * gain)
354	*
355	* The key factor, gain, controls the amount of queue. While a small gain
356	* builds a smaller queue, it becomes more vulnerable to noise in RTT
357	* measurements (e.g., delayed ACKs or other ACK compression effects). This
358	* noise may cause BBR to under-estimate the rate.
359	*/
360	static u32 bbr_bdp(struct sock sk, u32 bw, int* gain)
361	{
362	struct bbr *bbr = inet_csk_ca(sk);
363	u32 bdp;
364	u64 w;
365
366	/ If we've never had a valid RTT sample, cap cwnd at the initial*
367	* default. This should only happen when the connection is not using TCP
368	* timestamps and has retransmitted all of the SYN/SYNACK/data packets
369	* ACKed so far. In this case, an RTO can cut cwnd to 1, in which
370	* case we need to slow-start up toward something safe: TCP_INIT_CWND.
371	*/
372	if (unlikely(bbr->min_rtt_us == ~`0U`)) / no valid RTT samples yet? /
373	return TCP_INIT_CWND; / be safe: cap at default initial cwnd/
374
375	w = (u64)bw * bbr->min_rtt_us;
376
377	/ Apply a gain to the given value, remove the BW_SCALE shift, and*
378	* round the value up to avoid a negative feedback loop.
379	*/
380	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - `1`) / BW_UNIT;
381
382	return bdp;
383	}
384
385	/ To achieve full performance in high-speed paths, we budget enough cwnd to*
386	* fit full-sized skbs in-flight on both end hosts to fully utilize the path:
387	* - one skb in sending host Qdisc,
388	* - one skb in sending host TSO/GSO engine
389	* - one skb being received by receiver host LRO/GRO/delayed-ACK engine
390	* Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
391	* in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
392	* which allows 2 outstanding 2-packet sequences, to try to keep pipe
393	* full even with ACK-every-other-packet delayed ACKs.
394	*/
395	static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
396	{
397	struct bbr *bbr = inet_csk_ca(sk);
398
399	/ Allow enough full-sized skbs in flight to utilize end systems. /
400	cwnd += `3` * bbr_tso_segs_goal(sk);
401
402	/ Reduce delayed ACKs by rounding up cwnd to the next even number. /
403	cwnd = (cwnd + `1`) & ~`1U`;
404
405	/ Ensure gain cycling gets inflight above BDP even for small BDPs. /
406	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == `0`)
407	cwnd += `2`;
408
409	return cwnd;
410	}
411
412	/ Find inflight based on min RTT and the estimated bottleneck bandwidth. /
413	static u32 bbr_inflight(struct sock sk, u32 bw, int* gain)
414	{
415	u32 inflight;
416
417	inflight = bbr_bdp(sk, bw, gain);
418	inflight = bbr_quantization_budget(sk, cwnd: inflight);
419
420	return inflight;
421	}
422
423	/ With pacing at lower layers, there's often less data "in the network" than*
424	* "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
425	* we often have several skbs queued in the pacing layer with a pre-scheduled
426	* earliest departure time (EDT). BBR adapts its pacing rate based on the
427	* inflight level that it estimates has already been "baked in" by previous
428	* departure time decisions. We calculate a rough estimate of the number of our
429	* packets that might be in the network at the earliest departure time for the
430	* next skb scheduled:
431	* in_network_at_edt = inflight_at_edt - (EDT - now) * bw
432	* If we're increasing inflight, then we want to know if the transmit of the
433	* EDT skb will push inflight above the target, so inflight_at_edt includes
434	* bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
435	* then estimate if inflight will sink too low just before the EDT transmit.
436	*/
437	static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
438	{
439	struct tcp_sock *tp = tcp_sk(sk);
440	struct bbr *bbr = inet_csk_ca(sk);
441	u64 now_ns, edt_ns, interval_us;
442	u32 interval_delivered, inflight_at_edt;
443
444	now_ns = tp->tcp_clock_cache;
445	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
446	interval_us = div_u64(dividend: edt_ns - now_ns, NSEC_PER_USEC);
447	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
448	inflight_at_edt = inflight_now;
449	if (bbr->pacing_gain > BBR_UNIT) / increasing inflight /
450	inflight_at_edt += bbr_tso_segs_goal(sk); / include EDT skb /
451	if (interval_delivered >= inflight_at_edt)
452	return `0`;
453	return inflight_at_edt - interval_delivered;
454	}
455
456	/ Find the cwnd increment based on estimate of ack aggregation /
457	static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
458	{
459	u32 max_aggr_cwnd, aggr_cwnd = `0`;
460
461	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
462	max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
463	/ BW_UNIT;
464	aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
465	>> BBR_SCALE;
466	aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
467	}
468
469	return aggr_cwnd;
470	}
471
472	/ An optimization in BBR to reduce losses: On the first round of recovery, we*
473	* follow the packet conservation principle: send P packets per P packets acked.
474	* After that, we slow-start and send at most 2*P packets per P packets acked.
475	* After recovery finishes, or upon undo, we restore the cwnd we had when
476	* recovery started (capped by the target cwnd based on estimated BDP).
477	*
478	* TODO(ycheng/ncardwell): implement a rate-based approach.
479	*/
480	static bool bbr_set_cwnd_to_recover_or_restore(
481	struct sock sk, const* struct rate_sample rs, u32 acked, u32 new_cwnd)
482	{
483	struct tcp_sock *tp = tcp_sk(sk);
484	struct bbr *bbr = inet_csk_ca(sk);
485	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
486	u32 cwnd = tcp_snd_cwnd(tp);
487
488	/ An ACK for P pkts should release at most 2P packets. We do this
489	* in two steps. First, here we deduct the number of lost packets.
490	* Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
491	*/
492	if (rs->losses > `0`)
493	cwnd = max_t(s32, cwnd - rs->losses, `1`);
494
495	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
496	/ Starting 1st round of Recovery, so do packet conservation. /
497	bbr->packet_conservation = `1`;
498	bbr->next_rtt_delivered = tp->delivered; / start round now /
499	/ Cut unused cwnd from app behavior, TSQ, or TSO deferral: /
500	cwnd = tcp_packets_in_flight(tp) + acked;
501	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
502	/ Exiting loss recovery; restore cwnd saved before recovery. /
503	cwnd = max(cwnd, bbr->prior_cwnd);
504	bbr->packet_conservation = `0`;
505	}
506	bbr->prev_ca_state = state;
507
508	if (bbr->packet_conservation) {
509	*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
510	return true; / yes, using packet conservation /
511	}
512	*new_cwnd = cwnd;
513	return false;
514	}
515
516	/ Slow-start up toward target cwnd (if bw estimate is growing, or packet loss*
517	* has drawn us down below target), or snap down to target if we're above it.
518	*/
519	static void bbr_set_cwnd(struct sock sk, const* struct rate_sample *rs,
520	u32 acked, u32 bw, int gain)
521	{
522	struct tcp_sock *tp = tcp_sk(sk);
523	struct bbr *bbr = inet_csk_ca(sk);
524	u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = `0`;
525
526	if (!acked)
527	goto done; / no packet fully ACKed; just apply caps /
528
529	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, new_cwnd: &cwnd))
530	goto done;
531
532	target_cwnd = bbr_bdp(sk, bw, gain);
533
534	/ Increment the cwnd to account for excess ACKed data that seems*
535	* due to aggregation (of data and/or ACKs) visible in the ACK stream.
536	*/
537	target_cwnd += bbr_ack_aggregation_cwnd(sk);
538	target_cwnd = bbr_quantization_budget(sk, cwnd: target_cwnd);
539
540	/ If we're below target cwnd, slow start cwnd toward target cwnd. /
541	if (bbr_full_bw_reached(sk)) / only cut cwnd if we filled the pipe /
542	cwnd = min(cwnd + acked, target_cwnd);
543	else if (cwnd < target_cwnd \|\| tp->delivered < TCP_INIT_CWND)
544	cwnd = cwnd + acked;
545	cwnd = max(cwnd, bbr_cwnd_min_target);
546
547	done:
548	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); / apply global cap /
549	if (bbr->mode == BBR_PROBE_RTT) / drain queue, refresh min_rtt /
550	tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
551	}
552
553	/ End cycle phase if it's time and/or we hit the phase's in-flight target. /
554	static bool bbr_is_next_cycle_phase(struct sock *sk,
555	const struct rate_sample *rs)
556	{
557	struct tcp_sock *tp = tcp_sk(sk);
558	struct bbr *bbr = inet_csk_ca(sk);
559	bool is_full_length =
560	tcp_stamp_us_delta(t1: tp->delivered_mstamp, t0: bbr->cycle_mstamp) >
561	bbr->min_rtt_us;
562	u32 inflight, bw;
563
564	/ The pacing_gain of 1.0 paces at the estimated bw to try to fully*
565	* use the pipe without increasing the queue.
566	*/
567	if (bbr->pacing_gain == BBR_UNIT)
568	return is_full_length; / just use wall clock time /
569
570	inflight = bbr_packets_in_net_at_edt(sk, inflight_now: rs->prior_in_flight);
571	bw = bbr_max_bw(sk);
572
573	/ A pacing_gain > 1.0 probes for bw by trying to raise inflight to at*
574	* least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
575	* small (e.g. on a LAN). We do not persist if packets are lost, since
576	* a path with small buffers may not hold that much.
577	*/
578	if (bbr->pacing_gain > BBR_UNIT)
579	return is_full_length &&
580	(rs->losses \|\| / perhaps pacing_gainBDP won't fit /*
581	inflight >= bbr_inflight(sk, bw, gain: bbr->pacing_gain));
582
583	/ A pacing_gain < 1.0 tries to drain extra queue we added if bw*
584	* probing didn't find more bw. If inflight falls to match BDP then we
585	* estimate queue is drained; persisting would underutilize the pipe.
586	*/
587	return is_full_length \|\|
588	inflight <= bbr_inflight(sk, bw, BBR_UNIT);
589	}
590
591	static void bbr_advance_cycle_phase(struct sock *sk)
592	{
593	struct tcp_sock *tp = tcp_sk(sk);
594	struct bbr *bbr = inet_csk_ca(sk);
595
596	bbr->cycle_idx = (bbr->cycle_idx + `1`) & (CYCLE_LEN - `1`);
597	bbr->cycle_mstamp = tp->delivered_mstamp;
598	}
599
600	/ Gain cycling: cycle pacing gain to converge to fair share of available bw. /
601	static void bbr_update_cycle_phase(struct sock *sk,
602	const struct rate_sample *rs)
603	{
604	struct bbr *bbr = inet_csk_ca(sk);
605
606	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
607	bbr_advance_cycle_phase(sk);
608	}
609
610	static void bbr_reset_startup_mode(struct sock *sk)
611	{
612	struct bbr *bbr = inet_csk_ca(sk);
613
614	bbr->mode = BBR_STARTUP;
615	}
616
617	static void bbr_reset_probe_bw_mode(struct sock *sk)
618	{
619	struct bbr *bbr = inet_csk_ca(sk);
620
621	bbr->mode = BBR_PROBE_BW;
622	bbr->cycle_idx = CYCLE_LEN - `1` - get_random_u32_below(ceil: bbr_cycle_rand);
623	bbr_advance_cycle_phase(sk); / flip to next phase of gain cycle /
624	}
625
/* Choose the mode to resume in: BBR_PROBE_BW once full bw has been
 * reached, BBR_STARTUP otherwise.
 */
static void bbr_reset_mode(struct sock *sk)
{
	if (bbr_full_bw_reached(sk))
		bbr_reset_probe_bw_mode(sk);
	else
		bbr_reset_startup_mode(sk);
}
633
634	/ Start a new long-term sampling interval. /
635	static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
636	{
637	struct tcp_sock *tp = tcp_sk(sk);
638	struct bbr *bbr = inet_csk_ca(sk);
639
640	bbr->lt_last_stamp = div_u64(dividend: tp->delivered_mstamp, USEC_PER_MSEC);
641	bbr->lt_last_delivered = tp->delivered;
642	bbr->lt_last_lost = tp->lost;
643	bbr->lt_rtt_cnt = `0`;
644	}
645
646	/ Completely reset long-term bandwidth sampling. /
647	static void bbr_reset_lt_bw_sampling(struct sock *sk)
648	{
649	struct bbr *bbr = inet_csk_ca(sk);
650
651	bbr->lt_bw = `0`;
652	bbr->lt_use_bw = `0`;
653	bbr->lt_is_sampling = false;
654	bbr_reset_lt_bw_sampling_interval(sk);
655	}
656
657	/ Long-term bw sampling interval is done. Estimate whether we're policed. /
658	static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
659	{
660	struct bbr *bbr = inet_csk_ca(sk);
661	u32 diff;
662
663	if (bbr->lt_bw) { / do we have bw from a previous interval? /
664	/ Is new bw close to the lt_bw from the previous interval? /
665	diff = abs(bw - bbr->lt_bw);
666	if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) \|\|
667	(bbr_rate_bytes_per_sec(sk, rate: diff, BBR_UNIT) <=
668	bbr_lt_bw_diff)) {
669	/ All criteria are met; estimate we're policed. /
670	bbr->lt_bw = (bw + bbr->lt_bw) >> `1`; / avg 2 intvls /
671	bbr->lt_use_bw = `1`;
672	bbr->pacing_gain = BBR_UNIT; / try to avoid drops /
673	bbr->lt_rtt_cnt = `0`;
674	return;
675	}
676	}
677	bbr->lt_bw = bw;
678	bbr_reset_lt_bw_sampling_interval(sk);
679	}
680
681	/ Token-bucket traffic policers are common (see "An Internet-Wide Analysis of*
682	* Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
683	* explicitly models their policed rate, to reduce unnecessary losses. We
684	* estimate that we're policed if we see 2 consecutive sampling intervals with
685	* consistent throughput and high packet loss. If we think we're being policed,
686	* set lt_bw to the "long-term" average delivery rate from those 2 intervals.
687	*/
688	static void bbr_lt_bw_sampling(struct sock sk, const* struct rate_sample *rs)
689	{
690	struct tcp_sock *tp = tcp_sk(sk);
691	struct bbr *bbr = inet_csk_ca(sk);
692	u32 lost, delivered;
693	u64 bw;
694	u32 t;
695
696	if (bbr->lt_use_bw) { / already using long-term rate, lt_bw? /
697	if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
698	++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
699	bbr_reset_lt_bw_sampling(sk); / stop using lt_bw /
700	bbr_reset_probe_bw_mode(sk); / restart gain cycling /
701	}
702	return;
703	}
704
705	/ Wait for the first loss before sampling, to let the policer exhaust*
706	* its tokens and estimate the steady-state rate allowed by the policer.
707	* Starting samples earlier includes bursts that over-estimate the bw.
708	*/
709	if (!bbr->lt_is_sampling) {
710	if (!rs->losses)
711	return;
712	bbr_reset_lt_bw_sampling_interval(sk);
713	bbr->lt_is_sampling = true;
714	}
715
716	/ To avoid underestimates, reset sampling if we run out of data. /
717	if (rs->is_app_limited) {
718	bbr_reset_lt_bw_sampling(sk);
719	return;
720	}
721
722	if (bbr->round_start)
723	bbr->lt_rtt_cnt++; / count round trips in this interval /
724	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
725	return; / sampling interval needs to be longer /
726	if (bbr->lt_rtt_cnt > `4` * bbr_lt_intvl_min_rtts) {
727	bbr_reset_lt_bw_sampling(sk); / interval is too long /
728	return;
729	}
730
731	/ End sampling interval when a packet is lost, so we estimate the*
732	* policer tokens were exhausted. Stopping the sampling before the
733	* tokens are exhausted under-estimates the policed rate.
734	*/
735	if (!rs->losses)
736	return;
737
738	/ Calculate packets lost and delivered in sampling interval. /
739	lost = tp->lost - bbr->lt_last_lost;
740	delivered = tp->delivered - bbr->lt_last_delivered;
741	/ Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. /
742	if (!delivered \|\| (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
743	return;
744
745	/ Find average delivery rate in this sampling interval. /
746	t = div_u64(dividend: tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
747	if ((s32)t < `1`)
748	return; / interval is less than one ms, so wait /
749	/ Check if can multiply without overflow /
750	if (t >= ~`0U` / USEC_PER_MSEC) {
751	bbr_reset_lt_bw_sampling(sk); / interval too long; reset /
752	return;
753	}
754	t *= USEC_PER_MSEC;
755	bw = (u64)delivered * BW_UNIT;
756	do_div(bw, t);
757	bbr_lt_bw_interval_done(sk, bw);
758	}
759
760	/ Estimate the bandwidth based on how fast packets are delivered /
761	static void bbr_update_bw(struct sock sk, const* struct rate_sample *rs)
762	{
763	struct tcp_sock *tp = tcp_sk(sk);
764	struct bbr *bbr = inet_csk_ca(sk);
765	u64 bw;
766
767	bbr->round_start = `0`;
768	if (rs->delivered < `0` \|\| rs->interval_us <= `0`)
769	return; / Not a valid observation /
770
771	/ See if we've reached the next RTT /
772	if (!before(seq1: rs->prior_delivered, seq2: bbr->next_rtt_delivered)) {
773	bbr->next_rtt_delivered = tp->delivered;
774	bbr->rtt_cnt++;
775	bbr->round_start = `1`;
776	bbr->packet_conservation = `0`;
777	}
778
779	bbr_lt_bw_sampling(sk, rs);
780
781	/ Divide delivered by the interval to find a (lower bound) bottleneck*
782	* bandwidth sample. Delivered is in packets and interval_us in uS and
783	* ratio will be <<1 for most connections. So delivered is first scaled.
784	*/
785	bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
786
787	/ If this sample is application-limited, it is likely to have a very*
788	* low delivered count that represents application behavior rather than
789	* the available network rate. Such a sample could drag down estimated
790	* bw, causing needless slow-down. Thus, to continue to send at the
791	* last measured network rate, we filter out app-limited samples unless
792	* they describe the path bw at least as well as our bw model.
793	*
794	* So the goal during app-limited phase is to proceed with the best
795	* network rate no matter how long. We automatically leave this
796	* phase when app writes faster than the network can deliver :)
797	*/
798	if (!rs->is_app_limited \|\| bw >= bbr_max_bw(sk)) {
799	/ Incorporate new sample into our max bw filter. /
800	minmax_running_max(m: &bbr->bw, win: bbr_bw_rtts, t: bbr->rtt_cnt, meas: bw);
801	}
802	}
803
804	/ Estimates the windowed max degree of ack aggregation.*
805	* This is used to provision extra in-flight data to keep sending during
806	* inter-ACK silences.
807	*
808	* Degree of ack aggregation is estimated as extra data acked beyond expected.
809	*
810	* max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
811	* cwnd += max_extra_acked
812	*
813	* Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
814	* Max filter is an approximate sliding window of 5-10 (packet timed) round
815	* trips.
816	*/
817	static void bbr_update_ack_aggregation(struct sock *sk,
818	const struct rate_sample *rs)
819	{
820	u32 epoch_us, expected_acked, extra_acked;
821	struct bbr *bbr = inet_csk_ca(sk);
822	struct tcp_sock *tp = tcp_sk(sk);
823
824	if (!bbr_extra_acked_gain \|\| rs->acked_sacked <= `0` \|\|
825	rs->delivered < `0` \|\| rs->interval_us <= `0`)
826	return;
827
828	if (bbr->round_start) {
829	bbr->extra_acked_win_rtts = min(`0x1F`,
830	bbr->extra_acked_win_rtts + `1`);
831	if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
832	bbr->extra_acked_win_rtts = `0`;
833	bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
834	`0` : `1`;
835	bbr->extra_acked[bbr->extra_acked_win_idx] = `0`;
836	}
837	}
838
839	/ Compute how many packets we expected to be delivered over epoch. /
840	epoch_us = tcp_stamp_us_delta(t1: tp->delivered_mstamp,
841	t0: bbr->ack_epoch_mstamp);
842	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
843
844	/ Reset the aggregation epoch if ACK rate is below expected rate or*
845	* significantly large no. of ack received since epoch (potentially
846	* quite old epoch).
847	*/
848	if (bbr->ack_epoch_acked <= expected_acked \|\|
849	(bbr->ack_epoch_acked + rs->acked_sacked >=
850	bbr_ack_epoch_acked_reset_thresh)) {
851	bbr->ack_epoch_acked = `0`;
852	bbr->ack_epoch_mstamp = tp->delivered_mstamp;
853	expected_acked = `0`;
854	}
855
856	/ Compute excess data delivered, beyond what was expected. /
857	bbr->ack_epoch_acked = min_t(u32, `0xFFFFF`,
858	bbr->ack_epoch_acked + rs->acked_sacked);
859	extra_acked = bbr->ack_epoch_acked - expected_acked;
860	extra_acked = min(extra_acked, tcp_snd_cwnd(tp));
861	if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
862	bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
863	}
864
865	/ Estimate when the pipe is full, using the change in delivery rate: BBR*
866	* estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
867	* at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
868	* rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
869	* higher rwin, 3: we get higher delivery rate samples. Or transient
870	* cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
871	* design goal, but uses delay and inter-ACK spacing instead of bandwidth.
872	*/
873	static void bbr_check_full_bw_reached(struct sock *sk,
874	const struct rate_sample *rs)
875	{
876	struct bbr *bbr = inet_csk_ca(sk);
877	u32 bw_thresh;
878
879	if (bbr_full_bw_reached(sk) \|\| !bbr->round_start \|\| rs->is_app_limited)
880	return;
881
882	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
883	if (bbr_max_bw(sk) >= bw_thresh) {
884	bbr->full_bw = bbr_max_bw(sk);
885	bbr->full_bw_cnt = `0`;
886	return;
887	}
888	++bbr->full_bw_cnt;
889	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
890	}
891
892	/ If pipe is probably full, drain the queue and then enter steady-state. /
893	static void bbr_check_drain(struct sock sk, const* struct rate_sample *rs)
894	{
895	struct bbr *bbr = inet_csk_ca(sk);
896
897	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
898	bbr->mode = BBR_DRAIN; / drain queue we created /
899	tcp_sk(sk)->snd_ssthresh =
900	bbr_inflight(sk, bw: bbr_max_bw(sk), BBR_UNIT);
901	} / fall through to check if in-flight is already small: /
902	if (bbr->mode == BBR_DRAIN &&
903	bbr_packets_in_net_at_edt(sk, inflight_now: tcp_packets_in_flight(tcp_sk(sk))) <=
904	bbr_inflight(sk, bw: bbr_max_bw(sk), BBR_UNIT))
905	bbr_reset_probe_bw_mode(sk); / we estimate queue is drained /
906	}
907
908	static void bbr_check_probe_rtt_done(struct sock *sk)
909	{
910	struct tcp_sock *tp = tcp_sk(sk);
911	struct bbr *bbr = inet_csk_ca(sk);
912
913	if (!(bbr->probe_rtt_done_stamp &&
914	after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
915	return;
916
917	bbr->min_rtt_stamp = tcp_jiffies32; / wait a while until PROBE_RTT /
918	tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
919	bbr_reset_mode(sk);
920	}
921
922	/ The goal of PROBE_RTT mode is to have BBR flows cooperatively and*
923	* periodically drain the bottleneck queue, to converge to measure the true
924	* min_rtt (unloaded propagation delay). This allows the flows to keep queues
925	* small (reducing queuing delay and packet loss) and achieve fairness among
926	* BBR flows.
927	*
928	* The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
929	* we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
930	* After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
931	* round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
932	* re-enter the previous mode. BBR uses 200ms to approximately bound the
933	* performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
934	*
935	* Note that flows need only pay 2% if they are busy sending over the last 10
936	* seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
937	* natural silences or low-rate periods within 10 seconds where the rate is low
938	* enough for long enough to drain its queue in the bottleneck. We pick up
939	* these min RTT measurements opportunistically with our min_rtt filter. :-)
940	*/
941	static void bbr_update_min_rtt(struct sock sk, const* struct rate_sample *rs)
942	{
943	struct tcp_sock *tp = tcp_sk(sk);
944	struct bbr *bbr = inet_csk_ca(sk);
945	bool filter_expired;
946
947	/ Track min RTT seen in the min_rtt_win_sec filter window: /
948	filter_expired = after(tcp_jiffies32,
949	bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
950	if (rs->rtt_us >= `0` &&
951	(rs->rtt_us < bbr->min_rtt_us \|\|
952	(filter_expired && !rs->is_ack_delayed))) {
953	bbr->min_rtt_us = rs->rtt_us;
954	bbr->min_rtt_stamp = tcp_jiffies32;
955	}
956
957	if (bbr_probe_rtt_mode_ms > `0` && filter_expired &&
958	!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
959	bbr->mode = BBR_PROBE_RTT; / dip, drain queue /
960	bbr_save_cwnd(sk); / note cwnd so we can restore it /
961	bbr->probe_rtt_done_stamp = `0`;
962	}
963
964	if (bbr->mode == BBR_PROBE_RTT) {
965	/ Ignore low rate samples during this mode. /
966	tp->app_limited =
967	(tp->delivered + tcp_packets_in_flight(tp)) ? : `1`;
968	/ Maintain min packets in flight for max(200 ms, 1 round). /
969	if (!bbr->probe_rtt_done_stamp &&
970	tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
971	bbr->probe_rtt_done_stamp = tcp_jiffies32 +
972	msecs_to_jiffies(m: bbr_probe_rtt_mode_ms);
973	bbr->probe_rtt_round_done = `0`;
974	bbr->next_rtt_delivered = tp->delivered;
975	} else if (bbr->probe_rtt_done_stamp) {
976	if (bbr->round_start)
977	bbr->probe_rtt_round_done = `1`;
978	if (bbr->probe_rtt_round_done)
979	bbr_check_probe_rtt_done(sk);
980	}
981	}
982	/ Restart after idle ends only once we process a new S/ACK for data /
983	if (rs->delivered > `0`)
984	bbr->idle_restart = `0`;
985	}
986
987	static void bbr_update_gains(struct sock *sk)
988	{
989	struct bbr *bbr = inet_csk_ca(sk);
990
991	switch (bbr->mode) {
992	case BBR_STARTUP:
993	bbr->pacing_gain = bbr_high_gain;
994	bbr->cwnd_gain = bbr_high_gain;
995	break;
996	case BBR_DRAIN:
997	bbr->pacing_gain = bbr_drain_gain; / slow, to drain /
998	bbr->cwnd_gain = bbr_high_gain; / keep cwnd /
999	break;
1000	case BBR_PROBE_BW:
1001	bbr->pacing_gain = (bbr->lt_use_bw ?
1002	BBR_UNIT :
1003	bbr_pacing_gain[bbr->cycle_idx]);
1004	bbr->cwnd_gain = bbr_cwnd_gain;
1005	break;
1006	case BBR_PROBE_RTT:
1007	bbr->pacing_gain = BBR_UNIT;
1008	bbr->cwnd_gain = BBR_UNIT;
1009	break;
1010	default:
1011	WARN_ONCE(`1`, "BBR bad mode: %u\n", bbr->mode);
1012	break;
1013	}
1014	}
1015
/* Run every per-ACK model update step for this socket.
 *
 * sk: socket whose BBR state is updated.
 * rs: rate sample for the ACK being processed.
 *
 * The call order matters: bbr_update_bw() recomputes bbr->round_start, which
 * the ack-aggregation, full-bw and min_rtt steps read, and bbr_update_gains()
 * runs last so that it sees any mode change made by the drain and min_rtt
 * steps.
 */
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
	bbr_update_bw(sk, rs);
	bbr_update_ack_aggregation(sk, rs);
	bbr_update_cycle_phase(sk, rs);
	bbr_check_full_bw_reached(sk, rs);
	bbr_check_drain(sk, rs);
	bbr_update_min_rtt(sk, rs);
	bbr_update_gains(sk);
}
1026
1027	__bpf_kfunc static void bbr_main(struct sock sk, const* struct rate_sample *rs)
1028	{
1029	struct bbr *bbr = inet_csk_ca(sk);
1030	u32 bw;
1031
1032	bbr_update_model(sk, rs);
1033
1034	bw = bbr_bw(sk);
1035	bbr_set_pacing_rate(sk, bw, gain: bbr->pacing_gain);
1036	bbr_set_cwnd(sk, rs, acked: rs->acked_sacked, bw, gain: bbr->cwnd_gain);
1037	}
1038
1039	__bpf_kfunc static void bbr_init(struct sock *sk)
1040	{
1041	struct tcp_sock *tp = tcp_sk(sk);
1042	struct bbr *bbr = inet_csk_ca(sk);
1043
1044	bbr->prior_cwnd = `0`;
1045	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1046	bbr->rtt_cnt = `0`;
1047	bbr->next_rtt_delivered = tp->delivered;
1048	bbr->prev_ca_state = TCP_CA_Open;
1049	bbr->packet_conservation = `0`;
1050
1051	bbr->probe_rtt_done_stamp = `0`;
1052	bbr->probe_rtt_round_done = `0`;
1053	bbr->min_rtt_us = tcp_min_rtt(tp);
1054	bbr->min_rtt_stamp = tcp_jiffies32;
1055
1056	minmax_reset(m: &bbr->bw, t: bbr->rtt_cnt, meas: `0`); / init max bw to 0 /
1057
1058	bbr->has_seen_rtt = `0`;
1059	bbr_init_pacing_rate_from_rtt(sk);
1060
1061	bbr->round_start = `0`;
1062	bbr->idle_restart = `0`;
1063	bbr->full_bw_reached = `0`;
1064	bbr->full_bw = `0`;
1065	bbr->full_bw_cnt = `0`;
1066	bbr->cycle_mstamp = `0`;
1067	bbr->cycle_idx = `0`;
1068	bbr_reset_lt_bw_sampling(sk);
1069	bbr_reset_startup_mode(sk);
1070
1071	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
1072	bbr->ack_epoch_acked = `0`;
1073	bbr->extra_acked_win_rtts = `0`;
1074	bbr->extra_acked_win_idx = `0`;
1075	bbr->extra_acked[`0`] = `0`;
1076	bbr->extra_acked[`1`] = `0`;
1077
1078	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
1079	}
1080
1081	__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
1082	{
1083	/ Provision 3 * cwnd since BBR may slow-start even during recovery. /
1084	return `3`;
1085	}
1086
1087	/ In theory BBR does not need to undo the cwnd since it does not*
1088	* always reduce cwnd on losses (see bbr_main()). Keep it for now.
1089	*/
1090	__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
1091	{
1092	struct bbr *bbr = inet_csk_ca(sk);
1093
1094	bbr->full_bw = `0`; / spurious slow-down; reset full pipe detection /
1095	bbr->full_bw_cnt = `0`;
1096	bbr_reset_lt_bw_sampling(sk);
1097	return tcp_snd_cwnd(tcp_sk(sk));
1098	}
1099
1100	/ Entering loss recovery, so save cwnd for when we exit or undo recovery. /
1101	__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
1102	{
1103	bbr_save_cwnd(sk);
1104	return tcp_sk(sk)->snd_ssthresh;
1105	}
1106
1107	static size_t bbr_get_info(struct sock sk, u32 ext, int* *attr,
1108	union tcp_cc_info *info)
1109	{
1110	if (ext & (`1` << (INET_DIAG_BBRINFO - `1`)) \|\|
1111	ext & (`1` << (INET_DIAG_VEGASINFO - `1`))) {
1112	struct tcp_sock *tp = tcp_sk(sk);
1113	struct bbr *bbr = inet_csk_ca(sk);
1114	u64 bw = bbr_bw(sk);
1115
1116	bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
1117	memset(&info->bbr, `0`, sizeof(info->bbr));
1118	info->bbr.bbr_bw_lo = (u32)bw;
1119	info->bbr.bbr_bw_hi = (u32)(bw >> `32`);
1120	info->bbr.bbr_min_rtt = bbr->min_rtt_us;
1121	info->bbr.bbr_pacing_gain = bbr->pacing_gain;
1122	info->bbr.bbr_cwnd_gain = bbr->cwnd_gain;
1123	*attr = INET_DIAG_BBRINFO;
1124	return sizeof(info->bbr);
1125	}
1126	return `0`;
1127	}
1128
1129	__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
1130	{
1131	struct bbr *bbr = inet_csk_ca(sk);
1132
1133	if (new_state == TCP_CA_Loss) {
1134	struct rate_sample rs = { .losses = `1` };
1135
1136	bbr->prev_ca_state = TCP_CA_Loss;
1137	bbr->full_bw = `0`;
1138	bbr->round_start = `1`; / treat RTO like end of a round /
1139	bbr_lt_bw_sampling(sk, rs: &rs);
1140	}
1141	}
1142
1143	static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
1144	.flags = TCP_CONG_NON_RESTRICTED,
1145	.name = "bbr",
1146	.owner = THIS_MODULE,
1147	.init = bbr_init,
1148	.cong_control = bbr_main,
1149	.sndbuf_expand = bbr_sndbuf_expand,
1150	.undo_cwnd = bbr_undo_cwnd,
1151	.cwnd_event = bbr_cwnd_event,
1152	.ssthresh = bbr_ssthresh,
1153	.min_tso_segs = bbr_min_tso_segs,
1154	.get_info = bbr_get_info,
1155	.set_state = bbr_set_state,
1156	};
1157
1158	BTF_SET8_START(tcp_bbr_check_kfunc_ids)
1159	#ifdef CONFIG_X86
1160	#ifdef CONFIG_DYNAMIC_FTRACE
1161	BTF_ID_FLAGS(func, bbr_init)
1162	BTF_ID_FLAGS(func, bbr_main)
1163	BTF_ID_FLAGS(func, bbr_sndbuf_expand)
1164	BTF_ID_FLAGS(func, bbr_undo_cwnd)
1165	BTF_ID_FLAGS(func, bbr_cwnd_event)
1166	BTF_ID_FLAGS(func, bbr_ssthresh)
1167	BTF_ID_FLAGS(func, bbr_min_tso_segs)
1168	BTF_ID_FLAGS(func, bbr_set_state)
1169	#endif
1170	#endif
1171	BTF_SET8_END(tcp_bbr_check_kfunc_ids)
1172
1173	static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
1174	.owner = THIS_MODULE,
1175	.set = &tcp_bbr_check_kfunc_ids,
1176	};
1177
1178	static int __init bbr_register(void)
1179	{
1180	int ret;
1181
1182	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
1183
1184	ret = register_btf_kfunc_id_set(prog_type: BPF_PROG_TYPE_STRUCT_OPS, s: &tcp_bbr_kfunc_set);
1185	if (ret < `0`)
1186	return ret;
1187	return tcp_register_congestion_control(type: &tcp_bbr_cong_ops);
1188	}
1189
1190	static void __exit bbr_unregister(void)
1191	{
1192	tcp_unregister_congestion_control(type: &tcp_bbr_cong_ops);
1193	}
1194
1195	module_init(bbr_register);
1196	module_exit(bbr_unregister);
1197
1198	MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
1199	MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
1200	MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
1201	MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
1202	MODULE_LICENSE("Dual BSD/GPL");
1203	MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
1204

source code of linux/net/ipv4/tcp_bbr.c