1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Anycast support for IPv6 |
4 | * Linux INET6 implementation |
5 | * |
6 | * Authors: |
7 | * David L Stevens (dlstevens@us.ibm.com) |
8 | * |
9 | * based heavily on net/ipv6/mcast.c |
10 | */ |
11 | |
12 | #include <linux/capability.h> |
13 | #include <linux/module.h> |
14 | #include <linux/errno.h> |
15 | #include <linux/types.h> |
16 | #include <linux/random.h> |
17 | #include <linux/string.h> |
18 | #include <linux/socket.h> |
19 | #include <linux/sockios.h> |
20 | #include <linux/net.h> |
21 | #include <linux/in6.h> |
22 | #include <linux/netdevice.h> |
23 | #include <linux/if_arp.h> |
24 | #include <linux/route.h> |
25 | #include <linux/init.h> |
26 | #include <linux/proc_fs.h> |
27 | #include <linux/seq_file.h> |
28 | #include <linux/slab.h> |
29 | |
30 | #include <net/net_namespace.h> |
31 | #include <net/sock.h> |
32 | #include <net/snmp.h> |
33 | |
34 | #include <net/ipv6.h> |
35 | #include <net/protocol.h> |
36 | #include <net/if_inet6.h> |
37 | #include <net/ndisc.h> |
38 | #include <net/addrconf.h> |
39 | #include <net/ip6_route.h> |
40 | |
41 | #include <net/checksum.h> |
42 | |
43 | #define IN6_ADDR_HSIZE_SHIFT 8 |
44 | #define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT) |
45 | /* anycast address hash table |
46 | */ |
47 | static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; |
48 | static DEFINE_SPINLOCK(acaddr_hash_lock); |
49 | |
50 | static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); |
51 | |
52 | static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) |
53 | { |
54 | u32 val = ipv6_addr_hash(a: addr) ^ net_hash_mix(net); |
55 | |
56 | return hash_32(val, IN6_ADDR_HSIZE_SHIFT); |
57 | } |
58 | |
59 | /* |
60 | * socket join an anycast group |
61 | */ |
62 | |
63 | int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) |
64 | { |
65 | struct ipv6_pinfo *np = inet6_sk(sk: sk); |
66 | struct net_device *dev = NULL; |
67 | struct inet6_dev *idev; |
68 | struct ipv6_ac_socklist *pac; |
69 | struct net *net = sock_net(sk); |
70 | int ishost = !net->ipv6.devconf_all->forwarding; |
71 | int err = 0; |
72 | |
73 | ASSERT_RTNL(); |
74 | |
75 | if (!ns_capable(ns: net->user_ns, CAP_NET_ADMIN)) |
76 | return -EPERM; |
77 | if (ipv6_addr_is_multicast(addr)) |
78 | return -EINVAL; |
79 | |
80 | if (ifindex) |
81 | dev = __dev_get_by_index(net, ifindex); |
82 | |
83 | if (ipv6_chk_addr_and_flags(net, addr, dev, skip_dev_check: true, strict: 0, IFA_F_TENTATIVE)) |
84 | return -EINVAL; |
85 | |
86 | pac = sock_kmalloc(sk, size: sizeof(struct ipv6_ac_socklist), GFP_KERNEL); |
87 | if (!pac) |
88 | return -ENOMEM; |
89 | pac->acl_next = NULL; |
90 | pac->acl_addr = *addr; |
91 | |
92 | if (ifindex == 0) { |
93 | struct rt6_info *rt; |
94 | |
95 | rt = rt6_lookup(net, daddr: addr, NULL, oif: 0, NULL, flags: 0); |
96 | if (rt) { |
97 | dev = rt->dst.dev; |
98 | ip6_rt_put(rt); |
99 | } else if (ishost) { |
100 | err = -EADDRNOTAVAIL; |
101 | goto error; |
102 | } else { |
103 | /* router, no matching interface: just pick one */ |
104 | dev = __dev_get_by_flags(net, IFF_UP, |
105 | IFF_UP | IFF_LOOPBACK); |
106 | } |
107 | } |
108 | |
109 | if (!dev) { |
110 | err = -ENODEV; |
111 | goto error; |
112 | } |
113 | |
114 | idev = __in6_dev_get(dev); |
115 | if (!idev) { |
116 | if (ifindex) |
117 | err = -ENODEV; |
118 | else |
119 | err = -EADDRNOTAVAIL; |
120 | goto error; |
121 | } |
122 | /* reset ishost, now that we have a specific device */ |
123 | ishost = !idev->cnf.forwarding; |
124 | |
125 | pac->acl_ifindex = dev->ifindex; |
126 | |
127 | /* XXX |
128 | * For hosts, allow link-local or matching prefix anycasts. |
129 | * This obviates the need for propagating anycast routes while |
130 | * still allowing some non-router anycast participation. |
131 | */ |
132 | if (!ipv6_chk_prefix(addr, dev)) { |
133 | if (ishost) |
134 | err = -EADDRNOTAVAIL; |
135 | if (err) |
136 | goto error; |
137 | } |
138 | |
139 | err = __ipv6_dev_ac_inc(idev, addr); |
140 | if (!err) { |
141 | pac->acl_next = np->ipv6_ac_list; |
142 | np->ipv6_ac_list = pac; |
143 | pac = NULL; |
144 | } |
145 | |
146 | error: |
147 | if (pac) |
148 | sock_kfree_s(sk, mem: pac, size: sizeof(*pac)); |
149 | return err; |
150 | } |
151 | |
152 | /* |
153 | * socket leave an anycast group |
154 | */ |
155 | int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) |
156 | { |
157 | struct ipv6_pinfo *np = inet6_sk(sk: sk); |
158 | struct net_device *dev; |
159 | struct ipv6_ac_socklist *pac, *prev_pac; |
160 | struct net *net = sock_net(sk); |
161 | |
162 | ASSERT_RTNL(); |
163 | |
164 | prev_pac = NULL; |
165 | for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { |
166 | if ((ifindex == 0 || pac->acl_ifindex == ifindex) && |
167 | ipv6_addr_equal(a1: &pac->acl_addr, a2: addr)) |
168 | break; |
169 | prev_pac = pac; |
170 | } |
171 | if (!pac) |
172 | return -ENOENT; |
173 | if (prev_pac) |
174 | prev_pac->acl_next = pac->acl_next; |
175 | else |
176 | np->ipv6_ac_list = pac->acl_next; |
177 | |
178 | dev = __dev_get_by_index(net, ifindex: pac->acl_ifindex); |
179 | if (dev) |
180 | ipv6_dev_ac_dec(dev, addr: &pac->acl_addr); |
181 | |
182 | sock_kfree_s(sk, mem: pac, size: sizeof(*pac)); |
183 | return 0; |
184 | } |
185 | |
186 | void __ipv6_sock_ac_close(struct sock *sk) |
187 | { |
188 | struct ipv6_pinfo *np = inet6_sk(sk: sk); |
189 | struct net_device *dev = NULL; |
190 | struct ipv6_ac_socklist *pac; |
191 | struct net *net = sock_net(sk); |
192 | int prev_index; |
193 | |
194 | ASSERT_RTNL(); |
195 | pac = np->ipv6_ac_list; |
196 | np->ipv6_ac_list = NULL; |
197 | |
198 | prev_index = 0; |
199 | while (pac) { |
200 | struct ipv6_ac_socklist *next = pac->acl_next; |
201 | |
202 | if (pac->acl_ifindex != prev_index) { |
203 | dev = __dev_get_by_index(net, ifindex: pac->acl_ifindex); |
204 | prev_index = pac->acl_ifindex; |
205 | } |
206 | if (dev) |
207 | ipv6_dev_ac_dec(dev, addr: &pac->acl_addr); |
208 | sock_kfree_s(sk, mem: pac, size: sizeof(*pac)); |
209 | pac = next; |
210 | } |
211 | } |
212 | |
213 | void ipv6_sock_ac_close(struct sock *sk) |
214 | { |
215 | struct ipv6_pinfo *np = inet6_sk(sk: sk); |
216 | |
217 | if (!np->ipv6_ac_list) |
218 | return; |
219 | rtnl_lock(); |
220 | __ipv6_sock_ac_close(sk); |
221 | rtnl_unlock(); |
222 | } |
223 | |
224 | static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) |
225 | { |
226 | unsigned int hash = inet6_acaddr_hash(net, addr: &aca->aca_addr); |
227 | |
228 | spin_lock(lock: &acaddr_hash_lock); |
229 | hlist_add_head_rcu(n: &aca->aca_addr_lst, h: &inet6_acaddr_lst[hash]); |
230 | spin_unlock(lock: &acaddr_hash_lock); |
231 | } |
232 | |
233 | static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca) |
234 | { |
235 | spin_lock(lock: &acaddr_hash_lock); |
236 | hlist_del_init_rcu(n: &aca->aca_addr_lst); |
237 | spin_unlock(lock: &acaddr_hash_lock); |
238 | } |
239 | |
240 | static void aca_get(struct ifacaddr6 *aca) |
241 | { |
242 | refcount_inc(r: &aca->aca_refcnt); |
243 | } |
244 | |
245 | static void aca_free_rcu(struct rcu_head *h) |
246 | { |
247 | struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu); |
248 | |
249 | fib6_info_release(f6i: aca->aca_rt); |
250 | kfree(objp: aca); |
251 | } |
252 | |
253 | static void aca_put(struct ifacaddr6 *ac) |
254 | { |
255 | if (refcount_dec_and_test(r: &ac->aca_refcnt)) { |
256 | call_rcu(head: &ac->rcu, func: aca_free_rcu); |
257 | } |
258 | } |
259 | |
260 | static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, |
261 | const struct in6_addr *addr) |
262 | { |
263 | struct ifacaddr6 *aca; |
264 | |
265 | aca = kzalloc(size: sizeof(*aca), GFP_ATOMIC); |
266 | if (!aca) |
267 | return NULL; |
268 | |
269 | aca->aca_addr = *addr; |
270 | fib6_info_hold(f6i); |
271 | aca->aca_rt = f6i; |
272 | INIT_HLIST_NODE(h: &aca->aca_addr_lst); |
273 | aca->aca_users = 1; |
274 | /* aca_tstamp should be updated upon changes */ |
275 | aca->aca_cstamp = aca->aca_tstamp = jiffies; |
276 | refcount_set(r: &aca->aca_refcnt, n: 1); |
277 | |
278 | return aca; |
279 | } |
280 | |
281 | /* |
282 | * device anycast group inc (add if not found) |
283 | */ |
284 | int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) |
285 | { |
286 | struct ifacaddr6 *aca; |
287 | struct fib6_info *f6i; |
288 | struct net *net; |
289 | int err; |
290 | |
291 | ASSERT_RTNL(); |
292 | |
293 | write_lock_bh(&idev->lock); |
294 | if (idev->dead) { |
295 | err = -ENODEV; |
296 | goto out; |
297 | } |
298 | |
299 | for (aca = rtnl_dereference(idev->ac_list); aca; |
300 | aca = rtnl_dereference(aca->aca_next)) { |
301 | if (ipv6_addr_equal(a1: &aca->aca_addr, a2: addr)) { |
302 | aca->aca_users++; |
303 | err = 0; |
304 | goto out; |
305 | } |
306 | } |
307 | |
308 | net = dev_net(dev: idev->dev); |
309 | f6i = addrconf_f6i_alloc(net, idev, addr, anycast: true, GFP_ATOMIC, NULL); |
310 | if (IS_ERR(ptr: f6i)) { |
311 | err = PTR_ERR(ptr: f6i); |
312 | goto out; |
313 | } |
314 | aca = aca_alloc(f6i, addr); |
315 | if (!aca) { |
316 | fib6_info_release(f6i); |
317 | err = -ENOMEM; |
318 | goto out; |
319 | } |
320 | |
321 | /* Hold this for addrconf_join_solict() below before we unlock, |
322 | * it is already exposed via idev->ac_list. |
323 | */ |
324 | aca_get(aca); |
325 | aca->aca_next = idev->ac_list; |
326 | rcu_assign_pointer(idev->ac_list, aca); |
327 | |
328 | write_unlock_bh(&idev->lock); |
329 | |
330 | ipv6_add_acaddr_hash(net, aca); |
331 | |
332 | ip6_ins_rt(net, f6i); |
333 | |
334 | addrconf_join_solict(dev: idev->dev, addr: &aca->aca_addr); |
335 | |
336 | aca_put(ac: aca); |
337 | return 0; |
338 | out: |
339 | write_unlock_bh(&idev->lock); |
340 | return err; |
341 | } |
342 | |
343 | /* |
344 | * device anycast group decrement |
345 | */ |
346 | int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) |
347 | { |
348 | struct ifacaddr6 *aca, *prev_aca; |
349 | |
350 | ASSERT_RTNL(); |
351 | |
352 | write_lock_bh(&idev->lock); |
353 | prev_aca = NULL; |
354 | for (aca = rtnl_dereference(idev->ac_list); aca; |
355 | aca = rtnl_dereference(aca->aca_next)) { |
356 | if (ipv6_addr_equal(a1: &aca->aca_addr, a2: addr)) |
357 | break; |
358 | prev_aca = aca; |
359 | } |
360 | if (!aca) { |
361 | write_unlock_bh(&idev->lock); |
362 | return -ENOENT; |
363 | } |
364 | if (--aca->aca_users > 0) { |
365 | write_unlock_bh(&idev->lock); |
366 | return 0; |
367 | } |
368 | if (prev_aca) |
369 | rcu_assign_pointer(prev_aca->aca_next, aca->aca_next); |
370 | else |
371 | rcu_assign_pointer(idev->ac_list, aca->aca_next); |
372 | write_unlock_bh(&idev->lock); |
373 | ipv6_del_acaddr_hash(aca); |
374 | addrconf_leave_solict(idev, addr: &aca->aca_addr); |
375 | |
376 | ip6_del_rt(net: dev_net(dev: idev->dev), f6i: aca->aca_rt, skip_notify: false); |
377 | |
378 | aca_put(ac: aca); |
379 | return 0; |
380 | } |
381 | |
382 | /* called with rtnl_lock() */ |
383 | static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) |
384 | { |
385 | struct inet6_dev *idev = __in6_dev_get(dev); |
386 | |
387 | if (!idev) |
388 | return -ENODEV; |
389 | return __ipv6_dev_ac_dec(idev, addr); |
390 | } |
391 | |
392 | void ipv6_ac_destroy_dev(struct inet6_dev *idev) |
393 | { |
394 | struct ifacaddr6 *aca; |
395 | |
396 | write_lock_bh(&idev->lock); |
397 | while ((aca = rtnl_dereference(idev->ac_list)) != NULL) { |
398 | rcu_assign_pointer(idev->ac_list, aca->aca_next); |
399 | write_unlock_bh(&idev->lock); |
400 | |
401 | ipv6_del_acaddr_hash(aca); |
402 | |
403 | addrconf_leave_solict(idev, addr: &aca->aca_addr); |
404 | |
405 | ip6_del_rt(net: dev_net(dev: idev->dev), f6i: aca->aca_rt, skip_notify: false); |
406 | |
407 | aca_put(ac: aca); |
408 | |
409 | write_lock_bh(&idev->lock); |
410 | } |
411 | write_unlock_bh(&idev->lock); |
412 | } |
413 | |
414 | /* |
415 | * check if the interface has this anycast address |
416 | * called with rcu_read_lock() |
417 | */ |
418 | static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) |
419 | { |
420 | struct inet6_dev *idev; |
421 | struct ifacaddr6 *aca; |
422 | |
423 | idev = __in6_dev_get(dev); |
424 | if (idev) { |
425 | for (aca = rcu_dereference(idev->ac_list); aca; |
426 | aca = rcu_dereference(aca->aca_next)) |
427 | if (ipv6_addr_equal(a1: &aca->aca_addr, a2: addr)) |
428 | break; |
429 | return aca != NULL; |
430 | } |
431 | return false; |
432 | } |
433 | |
434 | /* |
435 | * check if given interface (or any, if dev==0) has this anycast address |
436 | */ |
437 | bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, |
438 | const struct in6_addr *addr) |
439 | { |
440 | struct net_device *nh_dev; |
441 | struct ifacaddr6 *aca; |
442 | bool found = false; |
443 | |
444 | rcu_read_lock(); |
445 | if (dev) |
446 | found = ipv6_chk_acast_dev(dev, addr); |
447 | else { |
448 | unsigned int hash = inet6_acaddr_hash(net, addr); |
449 | |
450 | hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], |
451 | aca_addr_lst) { |
452 | nh_dev = fib6_info_nh_dev(f6i: aca->aca_rt); |
453 | if (!nh_dev || !net_eq(net1: dev_net(dev: nh_dev), net2: net)) |
454 | continue; |
455 | if (ipv6_addr_equal(a1: &aca->aca_addr, a2: addr)) { |
456 | found = true; |
457 | break; |
458 | } |
459 | } |
460 | } |
461 | rcu_read_unlock(); |
462 | return found; |
463 | } |
464 | |
465 | /* check if this anycast address is link-local on given interface or |
466 | * is global |
467 | */ |
468 | bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, |
469 | const struct in6_addr *addr) |
470 | { |
471 | return ipv6_chk_acast_addr(net, |
472 | dev: (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ? |
473 | dev : NULL), |
474 | addr); |
475 | } |
476 | |
477 | #ifdef CONFIG_PROC_FS |
478 | struct ac6_iter_state { |
479 | struct seq_net_private p; |
480 | struct net_device *dev; |
481 | }; |
482 | |
483 | #define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) |
484 | |
485 | static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) |
486 | { |
487 | struct ac6_iter_state *state = ac6_seq_private(seq); |
488 | struct net *net = seq_file_net(seq); |
489 | struct ifacaddr6 *im = NULL; |
490 | |
491 | for_each_netdev_rcu(net, state->dev) { |
492 | struct inet6_dev *idev; |
493 | |
494 | idev = __in6_dev_get(dev: state->dev); |
495 | if (!idev) |
496 | continue; |
497 | im = rcu_dereference(idev->ac_list); |
498 | if (im) |
499 | break; |
500 | } |
501 | return im; |
502 | } |
503 | |
504 | static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) |
505 | { |
506 | struct ac6_iter_state *state = ac6_seq_private(seq); |
507 | struct inet6_dev *idev; |
508 | |
509 | im = rcu_dereference(im->aca_next); |
510 | while (!im) { |
511 | state->dev = next_net_device_rcu(dev: state->dev); |
512 | if (!state->dev) |
513 | break; |
514 | idev = __in6_dev_get(dev: state->dev); |
515 | if (!idev) |
516 | continue; |
517 | im = rcu_dereference(idev->ac_list); |
518 | } |
519 | return im; |
520 | } |
521 | |
522 | static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) |
523 | { |
524 | struct ifacaddr6 *im = ac6_get_first(seq); |
525 | if (im) |
526 | while (pos && (im = ac6_get_next(seq, im)) != NULL) |
527 | --pos; |
528 | return pos ? NULL : im; |
529 | } |
530 | |
531 | static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) |
532 | __acquires(RCU) |
533 | { |
534 | rcu_read_lock(); |
535 | return ac6_get_idx(seq, pos: *pos); |
536 | } |
537 | |
538 | static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
539 | { |
540 | struct ifacaddr6 *im = ac6_get_next(seq, im: v); |
541 | |
542 | ++*pos; |
543 | return im; |
544 | } |
545 | |
546 | static void ac6_seq_stop(struct seq_file *seq, void *v) |
547 | __releases(RCU) |
548 | { |
549 | rcu_read_unlock(); |
550 | } |
551 | |
552 | static int ac6_seq_show(struct seq_file *seq, void *v) |
553 | { |
554 | struct ifacaddr6 *im = (struct ifacaddr6 *)v; |
555 | struct ac6_iter_state *state = ac6_seq_private(seq); |
556 | |
557 | seq_printf(m: seq, fmt: "%-4d %-15s %pi6 %5d\n" , |
558 | state->dev->ifindex, state->dev->name, |
559 | &im->aca_addr, im->aca_users); |
560 | return 0; |
561 | } |
562 | |
563 | static const struct seq_operations ac6_seq_ops = { |
564 | .start = ac6_seq_start, |
565 | .next = ac6_seq_next, |
566 | .stop = ac6_seq_stop, |
567 | .show = ac6_seq_show, |
568 | }; |
569 | |
570 | int __net_init ac6_proc_init(struct net *net) |
571 | { |
572 | if (!proc_create_net("anycast6" , 0444, net->proc_net, &ac6_seq_ops, |
573 | sizeof(struct ac6_iter_state))) |
574 | return -ENOMEM; |
575 | |
576 | return 0; |
577 | } |
578 | |
579 | void ac6_proc_exit(struct net *net) |
580 | { |
581 | remove_proc_entry("anycast6" , net->proc_net); |
582 | } |
583 | #endif |
584 | |
585 | /* Init / cleanup code |
586 | */ |
587 | int __init ipv6_anycast_init(void) |
588 | { |
589 | int i; |
590 | |
591 | for (i = 0; i < IN6_ADDR_HSIZE; i++) |
592 | INIT_HLIST_HEAD(&inet6_acaddr_lst[i]); |
593 | return 0; |
594 | } |
595 | |
596 | void ipv6_anycast_cleanup(void) |
597 | { |
598 | int i; |
599 | |
600 | spin_lock(lock: &acaddr_hash_lock); |
601 | for (i = 0; i < IN6_ADDR_HSIZE; i++) |
602 | WARN_ON(!hlist_empty(&inet6_acaddr_lst[i])); |
603 | spin_unlock(lock: &acaddr_hash_lock); |
604 | } |
605 | |