1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
3 | * operating system. INET is implemented using the BSD Socket |
4 | * interface as the means of communication with the user level. |
5 | * |
6 | * Generic INET transport hashtables |
7 | * |
8 | * Authors: Lotsa people, from code originally in tcp |
9 | * |
10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public License |
12 | * as published by the Free Software Foundation; either version |
13 | * 2 of the License, or (at your option) any later version. |
14 | */ |
15 | |
16 | #include <linux/module.h> |
17 | #include <linux/random.h> |
18 | #include <linux/sched.h> |
19 | #include <linux/slab.h> |
20 | #include <linux/wait.h> |
21 | #include <linux/vmalloc.h> |
22 | #include <linux/memblock.h> |
23 | |
24 | #include <net/addrconf.h> |
25 | #include <net/inet_connection_sock.h> |
26 | #include <net/inet_hashtables.h> |
27 | #include <net/secure_seq.h> |
28 | #include <net/ip.h> |
29 | #include <net/tcp.h> |
30 | #include <net/sock_reuseport.h> |
31 | |
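/*
 * Hash the connection 4-tuple for the established table. The secret is
 * generated once, on first use, and mixed with a per-netns value so that
 * chain placement cannot be predicted from another namespace.
 */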
32 | static u32 inet_ehashfn(const struct net *net, const __be32 laddr, |
33 | const __u16 lport, const __be32 faddr, |
34 | const __be16 fport) |
35 | { |
36 | static u32 inet_ehash_secret __read_mostly; |
37 | |
38 | net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); |
39 | |
40 | return __inet_ehashfn(laddr, lport, faddr, fport, |
41 | inet_ehash_secret + net_hash_mix(net)); |
42 | } |
43 | |
44 | /* This function handles inet_sock, but also timewait and request sockets |
45 | * for IPv4/IPv6. |
46 | */ |
47 | static u32 sk_ehashfn(const struct sock *sk) |
48 | { |
49 | #if IS_ENABLED(CONFIG_IPV6) |
50 | if (sk->sk_family == AF_INET6 && |
51 | !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) |
52 | return inet6_ehashfn(sock_net(sk), |
53 | &sk->sk_v6_rcv_saddr, sk->sk_num, |
54 | &sk->sk_v6_daddr, sk->sk_dport); |
55 | #endif |
56 | return inet_ehashfn(sock_net(sk), |
57 | sk->sk_rcv_saddr, sk->sk_num, |
58 | sk->sk_daddr, sk->sk_dport); |
59 | } |
60 | |
61 | /* |
62 | * Allocate and initialize a new local port bind bucket. |
63 | * The bindhash mutex for snum's hash chain must be held here. |
64 | */ |
65 | struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, |
66 | struct net *net, |
67 | struct inet_bind_hashbucket *head, |
68 | const unsigned short snum, |
69 | int l3mdev) |
70 | { |
71 | struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); |
72 | |
73 | if (tb) { |
74 | write_pnet(&tb->ib_net, net); |
75 | tb->l3mdev = l3mdev; |
76 | tb->port = snum; |
77 | tb->fastreuse = 0; |
78 | tb->fastreuseport = 0; |
79 | INIT_HLIST_HEAD(&tb->owners); |
80 | hlist_add_head(&tb->node, &head->chain); |
81 | } |
82 | return tb; |
83 | } |
84 | |
85 | /* |
86 | * Caller must hold hashbucket lock for this tb with local BH disabled |
87 | */ |
88 | void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) |
89 | { |
90 | if (hlist_empty(&tb->owners)) { |
91 | __hlist_del(&tb->node); |
92 | kmem_cache_free(cachep, tb); |
93 | } |
94 | } |
95 | |
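/*
 * Tie @sk to an existing bind bucket: record the local port and link the
 * socket into the bucket's owner list. The caller holds the bucket lock.
 */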
96 | void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, |
97 | const unsigned short snum) |
98 | { |
99 | inet_sk(sk)->inet_num = snum; |
100 | sk_add_bind_node(sk, &tb->owners); |
101 | inet_csk(sk)->icsk_bind_hash = tb; |
102 | } |
103 | |
104 | /* |
105 | * Get rid of any references to a local port held by the given sock. |
106 | */ |
107 | static void __inet_put_port(struct sock *sk) |
108 | { |
109 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
110 | const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, |
111 | hashinfo->bhash_size); |
112 | struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; |
113 | struct inet_bind_bucket *tb; |
114 | |
115 | spin_lock(&head->lock); |
116 | tb = inet_csk(sk)->icsk_bind_hash; |
117 | __sk_del_bind_node(sk); |
118 | inet_csk(sk)->icsk_bind_hash = NULL; |
119 | inet_sk(sk)->inet_num = 0; |
120 | inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); |
121 | spin_unlock(&head->lock); |
122 | } |
123 | |
124 | void inet_put_port(struct sock *sk) |
125 | { |
126 | local_bh_disable(); |
127 | __inet_put_port(sk); |
128 | local_bh_enable(); |
129 | } |
130 | EXPORT_SYMBOL(inet_put_port); |
131 | |
132 | int __inet_inherit_port(const struct sock *sk, struct sock *child) |
133 | { |
134 | struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; |
135 | unsigned short port = inet_sk(child)->inet_num; |
136 | const int bhash = inet_bhashfn(sock_net(sk), port, |
137 | table->bhash_size); |
138 | struct inet_bind_hashbucket *head = &table->bhash[bhash]; |
139 | struct inet_bind_bucket *tb; |
140 | int l3mdev; |
141 | |
142 | spin_lock(&head->lock); |
143 | tb = inet_csk(sk)->icsk_bind_hash; |
144 | if (unlikely(!tb)) { |
145 | spin_unlock(&head->lock); |
146 | return -ENOENT; |
147 | } |
148 | if (tb->port != port) { |
149 | l3mdev = inet_sk_bound_l3mdev(sk); |
150 | |
151 | /* NOTE: using tproxy and redirecting skbs to a proxy |
152 | * on a different listener port breaks the assumption |
153 | * that the listener socket's icsk_bind_hash is the same |
154 | * as that of the child socket. We have to look up or |
155 | * create a new bind bucket for the child here. */ |
156 | inet_bind_bucket_for_each(tb, &head->chain) { |
157 | if (net_eq(ib_net(tb), sock_net(sk)) && |
158 | tb->l3mdev == l3mdev && tb->port == port) |
159 | break; |
160 | } |
161 | if (!tb) { |
162 | tb = inet_bind_bucket_create(table->bind_bucket_cachep, |
163 | sock_net(sk), head, port, |
164 | l3mdev); |
165 | if (!tb) { |
166 | spin_unlock(&head->lock); |
167 | return -ENOMEM; |
168 | } |
169 | } |
170 | } |
171 | inet_bind_hash(child, tb, port); |
172 | spin_unlock(&head->lock); |
173 | |
174 | return 0; |
175 | } |
176 | EXPORT_SYMBOL_GPL(__inet_inherit_port); |
177 | |
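/*
 * Pick the lhash2 (port+address) bucket for @sk, using the portaddr hash
 * of whichever address family the socket is bound to.
 */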
178 | static struct inet_listen_hashbucket * |
179 | inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) |
180 | { |
181 | u32 hash; |
182 | |
183 | #if IS_ENABLED(CONFIG_IPV6) |
184 | if (sk->sk_family == AF_INET6) |
185 | hash = ipv6_portaddr_hash(sock_net(sk), |
186 | &sk->sk_v6_rcv_saddr, |
187 | inet_sk(sk)->inet_num); |
188 | else |
189 | #endif |
190 | hash = ipv4_portaddr_hash(sock_net(sk), |
191 | inet_sk(sk)->inet_rcv_saddr, |
192 | inet_sk(sk)->inet_num); |
193 | return inet_lhash2_bucket(h, hash); |
194 | } |
195 | |
196 | static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) |
197 | { |
198 | struct inet_listen_hashbucket *ilb2; |
199 | |
200 | if (!h->lhash2) |
201 | return; |
202 | |
203 | ilb2 = inet_lhash2_bucket_sk(h, sk); |
204 | |
205 | spin_lock(&ilb2->lock); |
206 | if (sk->sk_reuseport && sk->sk_family == AF_INET6) |
207 | hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, |
208 | &ilb2->head); |
209 | else |
210 | hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, |
211 | &ilb2->head); |
212 | ilb2->count++; |
213 | spin_unlock(&ilb2->lock); |
214 | } |
215 | |
216 | static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) |
217 | { |
218 | struct inet_listen_hashbucket *ilb2; |
219 | |
220 | if (!h->lhash2 || |
221 | WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) |
222 | return; |
223 | |
224 | ilb2 = inet_lhash2_bucket_sk(h, sk); |
225 | |
226 | spin_lock(&ilb2->lock); |
227 | hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); |
228 | ilb2->count--; |
229 | spin_unlock(&ilb2->lock); |
230 | } |
231 | |
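/*
 * Score a listening socket against an incoming IPv4 segment: -1 means no
 * match, an AF_INET socket beats a v4-mapped AF_INET6 one, and a socket
 * whose incoming CPU matches the current one gets a bonus point.
 */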
232 | static inline int compute_score(struct sock *sk, struct net *net, |
233 | const unsigned short hnum, const __be32 daddr, |
234 | const int dif, const int sdif, bool exact_dif) |
235 | { |
236 | int score = -1; |
237 | |
238 | if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && |
239 | !ipv6_only_sock(sk)) { |
240 | if (sk->sk_rcv_saddr != daddr) |
241 | return -1; |
242 | |
243 | if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) |
244 | return -1; |
245 | |
246 | score = sk->sk_family == PF_INET ? 2 : 1; |
247 | if (sk->sk_incoming_cpu == raw_smp_processor_id()) |
248 | score++; |
249 | } |
250 | return score; |
251 | } |
252 | |
253 | /* |
254 | * Here are some nice properties to exploit here. The BSD API |
255 | * does not allow a listening sock to specify the remote port nor the |
256 | * remote address for the connection. So always assume those are both |
257 | * wildcarded during the search since they can never be otherwise. |
258 | */ |
259 | |
/* called with rcu_read_lock(): no refcount taken on the socket */
261 | static struct sock *inet_lhash2_lookup(struct net *net, |
262 | struct inet_listen_hashbucket *ilb2, |
263 | struct sk_buff *skb, int doff, |
264 | const __be32 saddr, __be16 sport, |
265 | const __be32 daddr, const unsigned short hnum, |
266 | const int dif, const int sdif) |
267 | { |
268 | bool exact_dif = inet_exact_dif_match(net, skb); |
269 | struct inet_connection_sock *icsk; |
270 | struct sock *sk, *result = NULL; |
271 | int score, hiscore = 0; |
272 | u32 phash = 0; |
273 | |
274 | inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { |
275 | sk = (struct sock *)icsk; |
276 | score = compute_score(sk, net, hnum, daddr, |
277 | dif, sdif, exact_dif); |
278 | if (score > hiscore) { |
279 | if (sk->sk_reuseport) { |
280 | phash = inet_ehashfn(net, daddr, hnum, |
281 | saddr, sport); |
282 | result = reuseport_select_sock(sk, phash, |
283 | skb, doff); |
284 | if (result) |
285 | return result; |
286 | } |
287 | result = sk; |
288 | hiscore = score; |
289 | } |
290 | } |
291 | |
292 | return result; |
293 | } |
294 | |
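/*
 * Caller must hold rcu_read_lock(). No reference is taken on the result:
 * listeners are SOCK_RCU_FREE, so the caller's RCU read-side section is
 * what keeps the returned socket alive.
 */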
295 | struct sock *__inet_lookup_listener(struct net *net, |
296 | struct inet_hashinfo *hashinfo, |
297 | struct sk_buff *skb, int doff, |
298 | const __be32 saddr, __be16 sport, |
299 | const __be32 daddr, const unsigned short hnum, |
300 | const int dif, const int sdif) |
301 | { |
302 | struct inet_listen_hashbucket *ilb2; |
303 | struct sock *result = NULL; |
304 | unsigned int hash2; |
305 | |
306 | hash2 = ipv4_portaddr_hash(net, daddr, hnum); |
307 | ilb2 = inet_lhash2_bucket(hashinfo, hash2); |
308 | |
309 | result = inet_lhash2_lookup(net, ilb2, skb, doff, |
310 | saddr, sport, daddr, hnum, |
311 | dif, sdif); |
312 | if (result) |
313 | goto done; |
314 | |
315 | /* Lookup lhash2 with INADDR_ANY */ |
316 | hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); |
317 | ilb2 = inet_lhash2_bucket(hashinfo, hash2); |
318 | |
319 | result = inet_lhash2_lookup(net, ilb2, skb, doff, |
320 | saddr, sport, htonl(INADDR_ANY), hnum, |
321 | dif, sdif); |
322 | done: |
323 | if (unlikely(IS_ERR(result))) |
324 | return NULL; |
325 | return result; |
326 | } |
327 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); |
328 | |
/* All sockets share a common refcount, but have different destructors. */
330 | void sock_gen_put(struct sock *sk) |
331 | { |
332 | if (!refcount_dec_and_test(&sk->sk_refcnt)) |
333 | return; |
334 | |
335 | if (sk->sk_state == TCP_TIME_WAIT) |
336 | inet_twsk_free(inet_twsk(sk)); |
337 | else if (sk->sk_state == TCP_NEW_SYN_RECV) |
338 | reqsk_free(inet_reqsk(sk)); |
339 | else |
340 | sk_free(sk); |
341 | } |
342 | EXPORT_SYMBOL_GPL(sock_gen_put); |
343 | |
344 | void sock_edemux(struct sk_buff *skb) |
345 | { |
346 | sock_gen_put(skb->sk); |
347 | } |
348 | EXPORT_SYMBOL(sock_edemux); |
349 | |
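/*
 * Lockless lookup in the established hash. Caller must hold
 * rcu_read_lock(); on success a reference is taken on the returned
 * socket, to be released with sock_gen_put() whatever its type.
 */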
350 | struct sock *__inet_lookup_established(struct net *net, |
351 | struct inet_hashinfo *hashinfo, |
352 | const __be32 saddr, const __be16 sport, |
353 | const __be32 daddr, const u16 hnum, |
354 | const int dif, const int sdif) |
355 | { |
356 | INET_ADDR_COOKIE(acookie, saddr, daddr); |
357 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); |
358 | struct sock *sk; |
359 | const struct hlist_nulls_node *node; |
/* Optimize here for a direct hit; only listening sockets can
 * have wildcards anyway.
 */
363 | unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); |
364 | unsigned int slot = hash & hashinfo->ehash_mask; |
365 | struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; |
366 | |
367 | begin: |
368 | sk_nulls_for_each_rcu(sk, node, &head->chain) { |
369 | if (sk->sk_hash != hash) |
370 | continue; |
371 | if (likely(INET_MATCH(sk, net, acookie, |
372 | saddr, daddr, ports, dif, sdif))) { |
373 | if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) |
374 | goto out; |
375 | if (unlikely(!INET_MATCH(sk, net, acookie, |
376 | saddr, daddr, ports, |
377 | dif, sdif))) { |
378 | sock_gen_put(sk); |
379 | goto begin; |
380 | } |
381 | goto found; |
382 | } |
383 | } |
384 | /* |
385 | * if the nulls value we got at the end of this lookup is |
386 | * not the expected one, we must restart lookup. |
387 | * We probably met an item that was moved to another chain. |
388 | */ |
389 | if (get_nulls_value(node) != slot) |
390 | goto begin; |
391 | out: |
392 | sk = NULL; |
393 | found: |
394 | return sk; |
395 | } |
396 | EXPORT_SYMBOL_GPL(__inet_lookup_established); |
397 | |
398 | /* called with local bh disabled */ |
399 | static int __inet_check_established(struct inet_timewait_death_row *death_row, |
400 | struct sock *sk, __u16 lport, |
401 | struct inet_timewait_sock **twp) |
402 | { |
403 | struct inet_hashinfo *hinfo = death_row->hashinfo; |
404 | struct inet_sock *inet = inet_sk(sk); |
405 | __be32 daddr = inet->inet_rcv_saddr; |
406 | __be32 saddr = inet->inet_daddr; |
407 | int dif = sk->sk_bound_dev_if; |
408 | struct net *net = sock_net(sk); |
409 | int sdif = l3mdev_master_ifindex_by_index(net, dif); |
410 | INET_ADDR_COOKIE(acookie, saddr, daddr); |
411 | const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); |
412 | unsigned int hash = inet_ehashfn(net, daddr, lport, |
413 | saddr, inet->inet_dport); |
414 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); |
415 | spinlock_t *lock = inet_ehash_lockp(hinfo, hash); |
416 | struct sock *sk2; |
417 | const struct hlist_nulls_node *node; |
418 | struct inet_timewait_sock *tw = NULL; |
419 | |
420 | spin_lock(lock); |
421 | |
422 | sk_nulls_for_each(sk2, node, &head->chain) { |
423 | if (sk2->sk_hash != hash) |
424 | continue; |
425 | |
426 | if (likely(INET_MATCH(sk2, net, acookie, |
427 | saddr, daddr, ports, dif, sdif))) { |
428 | if (sk2->sk_state == TCP_TIME_WAIT) { |
429 | tw = inet_twsk(sk2); |
430 | if (twsk_unique(sk, sk2, twp)) |
431 | break; |
432 | } |
433 | goto not_unique; |
434 | } |
435 | } |
436 | |
/* Must record num and sport now. Otherwise we would see
 * in the hash table a socket with a funny identity.
 */
440 | inet->inet_num = lport; |
441 | inet->inet_sport = htons(lport); |
442 | sk->sk_hash = hash; |
443 | WARN_ON(!sk_unhashed(sk)); |
444 | __sk_nulls_add_node_rcu(sk, &head->chain); |
445 | if (tw) { |
446 | sk_nulls_del_node_init_rcu((struct sock *)tw); |
447 | __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); |
448 | } |
449 | spin_unlock(lock); |
450 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
451 | |
452 | if (twp) { |
453 | *twp = tw; |
454 | } else if (tw) { |
455 | /* Silly. Should hash-dance instead... */ |
456 | inet_twsk_deschedule_put(tw); |
457 | } |
458 | return 0; |
459 | |
460 | not_unique: |
461 | spin_unlock(lock); |
462 | return -EADDRNOTAVAIL; |
463 | } |
464 | |
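/*
 * Per-destination starting offset into the ephemeral port range, derived
 * from the connection 3-tuple so that connects to different peers do not
 * all probe the ports in the same order.
 */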
465 | static u32 inet_sk_port_offset(const struct sock *sk) |
466 | { |
467 | const struct inet_sock *inet = inet_sk(sk); |
468 | |
469 | return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, |
470 | inet->inet_daddr, |
471 | inet->inet_dport); |
472 | } |
473 | |
/* Insert a socket into ehash, and possibly remove another one.
 * (The other one can be a SYN_RECV or TIMEWAIT socket.)
 */
477 | bool inet_ehash_insert(struct sock *sk, struct sock *osk) |
478 | { |
479 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
480 | struct hlist_nulls_head *list; |
481 | struct inet_ehash_bucket *head; |
482 | spinlock_t *lock; |
483 | bool ret = true; |
484 | |
485 | WARN_ON_ONCE(!sk_unhashed(sk)); |
486 | |
487 | sk->sk_hash = sk_ehashfn(sk); |
488 | head = inet_ehash_bucket(hashinfo, sk->sk_hash); |
489 | list = &head->chain; |
490 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
491 | |
492 | spin_lock(lock); |
493 | if (osk) { |
494 | WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); |
495 | ret = sk_nulls_del_node_init_rcu(osk); |
496 | } |
497 | if (ret) |
498 | __sk_nulls_add_node_rcu(sk, list); |
499 | spin_unlock(lock); |
500 | return ret; |
501 | } |
502 | |
503 | bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) |
504 | { |
505 | bool ok = inet_ehash_insert(sk, osk); |
506 | |
507 | if (ok) { |
508 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
509 | } else { |
510 | percpu_counter_inc(sk->sk_prot->orphan_count); |
511 | inet_sk_set_state(sk, TCP_CLOSE); |
512 | sock_set_flag(sk, SOCK_DEAD); |
513 | inet_csk_destroy_sock(sk); |
514 | } |
515 | return ok; |
516 | } |
517 | EXPORT_SYMBOL_GPL(inet_ehash_nolisten); |
518 | |
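/*
 * Join the reuseport group of a compatible listener already in @ilb, or
 * allocate a fresh group for @sk if none matches.
 */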
519 | static int inet_reuseport_add_sock(struct sock *sk, |
520 | struct inet_listen_hashbucket *ilb) |
521 | { |
522 | struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; |
523 | struct sock *sk2; |
524 | kuid_t uid = sock_i_uid(sk); |
525 | |
526 | sk_for_each_rcu(sk2, &ilb->head) { |
527 | if (sk2 != sk && |
528 | sk2->sk_family == sk->sk_family && |
529 | ipv6_only_sock(sk2) == ipv6_only_sock(sk) && |
530 | sk2->sk_bound_dev_if == sk->sk_bound_dev_if && |
531 | inet_csk(sk2)->icsk_bind_hash == tb && |
532 | sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && |
533 | inet_rcv_saddr_equal(sk, sk2, false)) |
534 | return reuseport_add_sock(sk, sk2, |
535 | inet_rcv_saddr_any(sk)); |
536 | } |
537 | |
538 | return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); |
539 | } |
540 | |
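/*
 * Hash @sk so that incoming packets can find it: listeners go on the
 * listening hash (and its lhash2 mirror), everything else goes straight
 * to ehash via inet_ehash_nolisten().
 */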
541 | int __inet_hash(struct sock *sk, struct sock *osk) |
542 | { |
543 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
544 | struct inet_listen_hashbucket *ilb; |
545 | int err = 0; |
546 | |
547 | if (sk->sk_state != TCP_LISTEN) { |
548 | inet_ehash_nolisten(sk, osk); |
549 | return 0; |
550 | } |
551 | WARN_ON(!sk_unhashed(sk)); |
552 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
553 | |
554 | spin_lock(&ilb->lock); |
555 | if (sk->sk_reuseport) { |
556 | err = inet_reuseport_add_sock(sk, ilb); |
557 | if (err) |
558 | goto unlock; |
559 | } |
560 | if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && |
561 | sk->sk_family == AF_INET6) |
562 | hlist_add_tail_rcu(&sk->sk_node, &ilb->head); |
563 | else |
564 | hlist_add_head_rcu(&sk->sk_node, &ilb->head); |
565 | inet_hash2(hashinfo, sk); |
566 | ilb->count++; |
567 | sock_set_flag(sk, SOCK_RCU_FREE); |
568 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
569 | unlock: |
570 | spin_unlock(&ilb->lock); |
571 | |
572 | return err; |
573 | } |
574 | EXPORT_SYMBOL(__inet_hash); |
575 | |
576 | int inet_hash(struct sock *sk) |
577 | { |
578 | int err = 0; |
579 | |
580 | if (sk->sk_state != TCP_CLOSE) { |
581 | local_bh_disable(); |
582 | err = __inet_hash(sk, NULL); |
583 | local_bh_enable(); |
584 | } |
585 | |
586 | return err; |
587 | } |
588 | EXPORT_SYMBOL_GPL(inet_hash); |
589 | |
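/*
 * Remove @sk from whichever table it is hashed in (listening or
 * established) and detach it from its reuseport group. Calling this on
 * an already-unhashed socket is a no-op.
 */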
590 | void inet_unhash(struct sock *sk) |
591 | { |
592 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
593 | struct inet_listen_hashbucket *ilb = NULL; |
594 | spinlock_t *lock; |
595 | |
596 | if (sk_unhashed(sk)) |
597 | return; |
598 | |
599 | if (sk->sk_state == TCP_LISTEN) { |
600 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
601 | lock = &ilb->lock; |
602 | } else { |
603 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
604 | } |
605 | spin_lock_bh(lock); |
606 | if (sk_unhashed(sk)) |
607 | goto unlock; |
608 | |
609 | if (rcu_access_pointer(sk->sk_reuseport_cb)) |
610 | reuseport_detach_sock(sk); |
611 | if (ilb) { |
612 | inet_unhash2(hashinfo, sk); |
613 | __sk_del_node_init(sk); |
614 | ilb->count--; |
615 | } else { |
616 | __sk_nulls_del_node_init_rcu(sk); |
617 | } |
618 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
619 | unlock: |
620 | spin_unlock_bh(lock); |
621 | } |
622 | EXPORT_SYMBOL_GPL(inet_unhash); |
623 | |
624 | int __inet_hash_connect(struct inet_timewait_death_row *death_row, |
625 | struct sock *sk, u32 port_offset, |
626 | int (*check_established)(struct inet_timewait_death_row *, |
627 | struct sock *, __u16, struct inet_timewait_sock **)) |
628 | { |
629 | struct inet_hashinfo *hinfo = death_row->hashinfo; |
630 | struct inet_timewait_sock *tw = NULL; |
631 | struct inet_bind_hashbucket *head; |
632 | int port = inet_sk(sk)->inet_num; |
633 | struct net *net = sock_net(sk); |
634 | struct inet_bind_bucket *tb; |
635 | u32 remaining, offset; |
636 | int ret, i, low, high; |
637 | static u32 hint; |
638 | int l3mdev; |
639 | |
640 | if (port) { |
641 | head = &hinfo->bhash[inet_bhashfn(net, port, |
642 | hinfo->bhash_size)]; |
643 | tb = inet_csk(sk)->icsk_bind_hash; |
644 | spin_lock_bh(&head->lock); |
645 | if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { |
646 | inet_ehash_nolisten(sk, NULL); |
647 | spin_unlock_bh(&head->lock); |
648 | return 0; |
649 | } |
650 | spin_unlock(&head->lock); |
/* No definite answer... Walk the established hash table. */
652 | ret = check_established(death_row, sk, port, NULL); |
653 | local_bh_enable(); |
654 | return ret; |
655 | } |
656 | |
657 | l3mdev = inet_sk_bound_l3mdev(sk); |
658 | |
659 | inet_get_local_port_range(net, &low, &high); |
660 | high++; /* [32768, 60999] -> [32768, 61000[ */ |
661 | remaining = high - low; |
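/* Keep the scan length even so both port parities get equal coverage. */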
662 | if (likely(remaining > 1)) |
663 | remaining &= ~1U; |
664 | |
665 | offset = (hint + port_offset) % remaining; |
/* In the first pass we try ports of @low's parity;
 * inet_csk_get_port() makes the opposite choice.
 */
669 | offset &= ~1U; |
670 | other_parity_scan: |
671 | port = low + offset; |
672 | for (i = 0; i < remaining; i += 2, port += 2) { |
673 | if (unlikely(port >= high)) |
674 | port -= remaining; |
675 | if (inet_is_local_reserved_port(net, port)) |
676 | continue; |
677 | head = &hinfo->bhash[inet_bhashfn(net, port, |
678 | hinfo->bhash_size)]; |
679 | spin_lock_bh(&head->lock); |
680 | |
/* Do not bother with rcv_saddr checks, because
 * the established check is already unique enough.
 */
684 | inet_bind_bucket_for_each(tb, &head->chain) { |
685 | if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && |
686 | tb->port == port) { |
687 | if (tb->fastreuse >= 0 || |
688 | tb->fastreuseport >= 0) |
689 | goto next_port; |
690 | WARN_ON(hlist_empty(&tb->owners)); |
691 | if (!check_established(death_row, sk, |
692 | port, &tw)) |
693 | goto ok; |
694 | goto next_port; |
695 | } |
696 | } |
697 | |
698 | tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, |
699 | net, head, port, l3mdev); |
700 | if (!tb) { |
701 | spin_unlock_bh(&head->lock); |
702 | return -ENOMEM; |
703 | } |
704 | tb->fastreuse = -1; |
705 | tb->fastreuseport = -1; |
706 | goto ok; |
707 | next_port: |
708 | spin_unlock_bh(&head->lock); |
709 | cond_resched(); |
710 | } |
711 | |
712 | offset++; |
713 | if ((offset & 1) && remaining > 1) |
714 | goto other_parity_scan; |
715 | |
716 | return -EADDRNOTAVAIL; |
717 | |
718 | ok: |
719 | hint += i + 2; |
720 | |
721 | /* Head lock still held and bh's disabled */ |
722 | inet_bind_hash(sk, tb, port); |
723 | if (sk_unhashed(sk)) { |
724 | inet_sk(sk)->inet_sport = htons(port); |
725 | inet_ehash_nolisten(sk, (struct sock *)tw); |
726 | } |
727 | if (tw) |
728 | inet_twsk_bind_unhash(tw, hinfo); |
729 | spin_unlock(&head->lock); |
730 | if (tw) |
731 | inet_twsk_deschedule_put(tw); |
732 | local_bh_enable(); |
733 | return 0; |
734 | } |
735 | |
736 | /* |
737 | * Bind a port for a connect operation and hash it. |
738 | */ |
739 | int inet_hash_connect(struct inet_timewait_death_row *death_row, |
740 | struct sock *sk) |
741 | { |
742 | u32 port_offset = 0; |
743 | |
744 | if (!inet_sk(sk)->inet_num) |
745 | port_offset = inet_sk_port_offset(sk); |
746 | return __inet_hash_connect(death_row, sk, port_offset, |
747 | __inet_check_established); |
748 | } |
749 | EXPORT_SYMBOL_GPL(inet_hash_connect); |
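
/*
 * Usage sketch, for illustration: the TCP connect() path binds a source
 * port and hashes the socket in one step, roughly
 *
 *	err = inet_hash_connect(&tcp_death_row, sk);
 *	if (err)
 *		goto failure;
 *
 * where tcp_death_row is the TCP stack's inet_timewait_death_row.
 */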
750 | |
751 | void inet_hashinfo_init(struct inet_hashinfo *h) |
752 | { |
753 | int i; |
754 | |
755 | for (i = 0; i < INET_LHTABLE_SIZE; i++) { |
756 | spin_lock_init(&h->listening_hash[i].lock); |
757 | INIT_HLIST_HEAD(&h->listening_hash[i].head); |
758 | h->listening_hash[i].count = 0; |
759 | } |
760 | |
761 | h->lhash2 = NULL; |
762 | } |
763 | EXPORT_SYMBOL_GPL(inet_hashinfo_init); |
764 | |
765 | static void init_hashinfo_lhash2(struct inet_hashinfo *h) |
766 | { |
767 | int i; |
768 | |
769 | for (i = 0; i <= h->lhash2_mask; i++) { |
770 | spin_lock_init(&h->lhash2[i].lock); |
771 | INIT_HLIST_HEAD(&h->lhash2[i].head); |
772 | h->lhash2[i].count = 0; |
773 | } |
774 | } |
775 | |
776 | void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, |
777 | unsigned long numentries, int scale, |
778 | unsigned long low_limit, |
779 | unsigned long high_limit) |
780 | { |
781 | h->lhash2 = alloc_large_system_hash(name, |
782 | sizeof(*h->lhash2), |
783 | numentries, |
784 | scale, |
785 | 0, |
786 | NULL, |
787 | &h->lhash2_mask, |
788 | low_limit, |
789 | high_limit); |
790 | init_hashinfo_lhash2(h); |
791 | } |
792 | |
793 | int inet_hashinfo2_init_mod(struct inet_hashinfo *h) |
794 | { |
795 | h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); |
796 | if (!h->lhash2) |
797 | return -ENOMEM; |
798 | |
799 | h->lhash2_mask = INET_LHTABLE_SIZE - 1; |
800 | /* INET_LHTABLE_SIZE must be a power of 2 */ |
801 | BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); |
802 | |
803 | init_hashinfo_lhash2(h); |
804 | return 0; |
805 | } |
806 | EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); |
807 | |
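/*
 * Size the ehash lock array: enough spinlocks to spread contention across
 * CPUs without false sharing, rounded up to a power of two (so that
 * ehash_locks_mask works as a mask) and capped at one lock per bucket.
 */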
808 | int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) |
809 | { |
810 | unsigned int locksz = sizeof(spinlock_t); |
811 | unsigned int i, nblocks = 1; |
812 | |
813 | if (locksz != 0) { |
/* Allocate two cache lines of spinlocks, or at least one spinlock, per cpu. */
815 | nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); |
816 | nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); |
817 | |
818 | /* no more locks than number of hash buckets */ |
819 | nblocks = min(nblocks, hashinfo->ehash_mask + 1); |
820 | |
821 | hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); |
822 | if (!hashinfo->ehash_locks) |
823 | return -ENOMEM; |
824 | |
825 | for (i = 0; i < nblocks; i++) |
826 | spin_lock_init(&hashinfo->ehash_locks[i]); |
827 | } |
828 | hashinfo->ehash_locks_mask = nblocks - 1; |
829 | return 0; |
830 | } |
831 | EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); |
832 | |