1 | /* Protective Load Balancing (PLB) |
2 | * |
3 | * PLB was designed to reduce link load imbalance across datacenter |
4 | * switches. PLB is a host-based optimization; it leverages congestion |
5 | * signals from the transport layer to randomly change the path of the |
6 | * connection experiencing sustained congestion. PLB prefers to repath |
7 | * after idle periods to minimize packet reordering. It repaths by |
8 | * changing the IPv6 Flow Label on the packets of a connection, which |
9 | * datacenter switches include as part of ECMP/WCMP hashing. |
10 | * |
11 | * PLB is described in detail in: |
12 | * |
 * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu,
 * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson,
 * David Wetherall, Abdul Kabbani:
 * "PLB: Congestion Signals are Simple and Effective for
 * Network Load Balancing"
 * In ACM SIGCOMM 2022, Amsterdam, Netherlands.
19 | * |
20 | */ |
21 | |
22 | #include <net/tcp.h> |
23 | |
24 | /* Called once per round-trip to update PLB state for a connection. */ |
25 | void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, |
26 | const int cong_ratio) |
27 | { |
28 | struct net *net = sock_net(sk); |
29 | |
30 | if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) |
31 | return; |
32 | |
33 | if (cong_ratio >= 0) { |
34 | if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) |
35 | plb->consec_cong_rounds = 0; |
36 | else if (plb->consec_cong_rounds < |
37 | READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) |
38 | plb->consec_cong_rounds++; |
39 | } |
40 | } |
41 | EXPORT_SYMBOL_GPL(tcp_plb_update_state); |
42 | |
43 | /* Check whether recent congestion has been persistent enough to warrant |
44 | * a load balancing decision that switches the connection to another path. |
45 | */ |
46 | void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) |
47 | { |
48 | struct net *net = sock_net(sk); |
49 | u32 max_suspend; |
50 | bool forced_rehash = false, idle_rehash = false; |
51 | |
52 | if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) |
53 | return; |
54 | |
55 | forced_rehash = plb->consec_cong_rounds >= |
56 | READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); |
57 | /* If sender goes idle then we check whether to rehash. */ |
58 | idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && |
59 | !tcp_sk(sk)->packets_out && |
60 | plb->consec_cong_rounds >= |
61 | READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); |
62 | |
63 | if (!forced_rehash && !idle_rehash) |
64 | return; |
65 | |
66 | /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for |
67 | * cases where the max suspension end is before the actual suspension |
68 | * end. We clear pause_until to 0 to indicate there is no recent |
69 | * RTO event that constrains PLB rehashing. |
70 | */ |
71 | max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; |
72 | if (plb->pause_until && |
73 | (!before(tcp_jiffies32, seq2: plb->pause_until) || |
74 | before(tcp_jiffies32 + max_suspend, seq2: plb->pause_until))) |
75 | plb->pause_until = 0; |
76 | |
77 | if (plb->pause_until) |
78 | return; |
79 | |
80 | sk_rethink_txhash(sk); |
81 | plb->consec_cong_rounds = 0; |
82 | tcp_sk(sk)->plb_rehash++; |
83 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); |
84 | } |
85 | EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); |
86 | |
87 | /* Upon RTO, disallow load balancing for a while, to avoid having load |
88 | * balancing decisions switch traffic to a black-holed path that was |
89 | * previously avoided with a sk_rethink_txhash() call at RTO time. |
90 | */ |
91 | void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) |
92 | { |
93 | struct net *net = sock_net(sk); |
94 | u32 pause; |
95 | |
96 | if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) |
97 | return; |
98 | |
99 | pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; |
100 | pause += get_random_u32_below(ceil: pause); |
101 | plb->pause_until = tcp_jiffies32 + pause; |
102 | |
103 | /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call |
104 | * that may switch this connection to a path with completely different |
105 | * congestion characteristics. |
106 | */ |
107 | plb->consec_cong_rounds = 0; |
108 | } |
109 | EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); |
110 | |