// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/siphash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <uapi/linux/netfilter/nf_nat.h>

#include "nf_internals.h"

#define NF_NAT_MAX_ATTEMPTS 128
#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)
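
/* Sketch of the search budget implied by the two constants above: a
 * single round of nf_nat_l4proto_unique_tuple() probes at most
 * NF_NAT_MAX_ATTEMPTS (128) candidate tuples, and the destructive
 * eviction path in nf_nat_used_tuple_harder() is only considered once
 * no more than NF_NAT_HARDER_THRESH (32) attempts remain.
 */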

static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static unsigned int nat_net_id __read_mostly;

static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static siphash_aligned_key_t nf_nat_hash_rnd;

struct nf_nat_lookup_hook_priv {
        struct nf_hook_entries __rcu *entries;

        struct rcu_head rcu_head;
};

struct nf_nat_hooks_net {
        struct nf_hook_ops *nat_hook_ops;
        unsigned int users;
};

struct nat_net {
        struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};

#ifdef CONFIG_XFRM
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
                                       const struct nf_conn *ct,
                                       enum ip_conntrack_dir dir,
                                       unsigned long statusbit,
                                       struct flowi *fl)
{
        const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
        struct flowi4 *fl4 = &fl->u.ip4;

        if (ct->status & statusbit) {
                fl4->daddr = t->dst.u3.ip;
                if (t->dst.protonum == IPPROTO_TCP ||
                    t->dst.protonum == IPPROTO_UDP ||
                    t->dst.protonum == IPPROTO_UDPLITE ||
                    t->dst.protonum == IPPROTO_DCCP ||
                    t->dst.protonum == IPPROTO_SCTP)
                        fl4->fl4_dport = t->dst.u.all;
        }

        statusbit ^= IPS_NAT_MASK;

        if (ct->status & statusbit) {
                fl4->saddr = t->src.u3.ip;
                if (t->dst.protonum == IPPROTO_TCP ||
                    t->dst.protonum == IPPROTO_UDP ||
                    t->dst.protonum == IPPROTO_UDPLITE ||
                    t->dst.protonum == IPPROTO_DCCP ||
                    t->dst.protonum == IPPROTO_SCTP)
                        fl4->fl4_sport = t->src.u.all;
        }
}
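
/* Note on the "statusbit ^= IPS_NAT_MASK" flip above: IPS_NAT_MASK is
 * IPS_SRC_NAT | IPS_DST_NAT, so the XOR swaps one NAT bit for the other,
 * letting the same body fill in both the translated destination and the
 * translated source of the flow.
 */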

static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
                                       const struct nf_conn *ct,
                                       enum ip_conntrack_dir dir,
                                       unsigned long statusbit,
                                       struct flowi *fl)
{
#if IS_ENABLED(CONFIG_IPV6)
        const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
        struct flowi6 *fl6 = &fl->u.ip6;

        if (ct->status & statusbit) {
                fl6->daddr = t->dst.u3.in6;
                if (t->dst.protonum == IPPROTO_TCP ||
                    t->dst.protonum == IPPROTO_UDP ||
                    t->dst.protonum == IPPROTO_UDPLITE ||
                    t->dst.protonum == IPPROTO_DCCP ||
                    t->dst.protonum == IPPROTO_SCTP)
                        fl6->fl6_dport = t->dst.u.all;
        }

        statusbit ^= IPS_NAT_MASK;

        if (ct->status & statusbit) {
                fl6->saddr = t->src.u3.in6;
                if (t->dst.protonum == IPPROTO_TCP ||
                    t->dst.protonum == IPPROTO_UDP ||
                    t->dst.protonum == IPPROTO_UDPLITE ||
                    t->dst.protonum == IPPROTO_DCCP ||
                    t->dst.protonum == IPPROTO_SCTP)
                        fl6->fl6_sport = t->src.u.all;
        }
#endif
}

static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
        const struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;
        enum ip_conntrack_dir dir;
        unsigned long statusbit;
        u8 family;

        ct = nf_ct_get(skb, &ctinfo);
        if (ct == NULL)
                return;

        family = nf_ct_l3num(ct);
        dir = CTINFO2DIR(ctinfo);
        if (dir == IP_CT_DIR_ORIGINAL)
                statusbit = IPS_DST_NAT;
        else
                statusbit = IPS_SRC_NAT;

        switch (family) {
        case NFPROTO_IPV4:
                nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
                return;
        case NFPROTO_IPV6:
                nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
                return;
        }
}
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *net,
            const struct nf_conntrack_zone *zone,
            const struct nf_conntrack_tuple *tuple)
{
        unsigned int hash;
        struct {
                struct nf_conntrack_man src;
                u32 net_mix;
                u32 protonum;
                u32 zone;
        } __aligned(SIPHASH_ALIGNMENT) combined;

        get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

        memset(&combined, 0, sizeof(combined));

        /* Original src, to ensure we map it consistently if poss. */
        combined.src = tuple->src;
        combined.net_mix = net_hash_mix(net);
        combined.protonum = tuple->dst.protonum;
        /* Zone ID can be used provided it's valid for both directions */
        if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
                combined.zone = zone->id;

        hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);

        return reciprocal_scale(hash, nf_nat_htable_size);
}
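
/* reciprocal_scale() maps the 32-bit siphash value onto a bucket index
 * as (u32)(((u64)hash * nf_nat_htable_size) >> 32), i.e. a multiply and
 * shift instead of a modulo; the result is always < nf_nat_htable_size.
 */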

/* Is this tuple already taken? (not by us) */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
                  const struct nf_conn *ignored_conntrack)
{
        /* Conntrack tracking doesn't keep track of outgoing tuples; only
         * incoming ones.  NAT means they don't have a fixed mapping,
         * so we invert the tuple and look for the incoming reply.
         *
         * We could keep a separate hash if this proves too slow.
         */
        struct nf_conntrack_tuple reply;

        nf_ct_invert_tuple(&reply, tuple);
        return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
        static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
                                                  IPS_DYING;
        static const unsigned long flags_needed = IPS_SRC_NAT;
        enum tcp_conntrack old_state;

        old_state = READ_ONCE(ct->proto.tcp.state);
        if (old_state < TCP_CONNTRACK_TIME_WAIT)
                return false;

        if (flags & flags_refuse)
                return false;

        return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
 * make sure such packets are invalid.
 */
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
        return (__s32)(new->proto.tcp.seen[0].td_end -
                       old->proto.tcp.seen[0].td_end) > 0;
}
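
/* The __s32 cast above implements serial-number arithmetic, so the
 * comparison stays correct across sequence number wraparound: e.g. an old
 * td_end of 0xffffff00 and a new td_end of 0x00000010 give a small
 * positive difference, hence "advanced".
 */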

static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
                         const struct nf_conn *ignored_conntrack,
                         unsigned int attempts_left)
{
        static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
        struct nf_conntrack_tuple_hash *thash;
        const struct nf_conntrack_zone *zone;
        struct nf_conntrack_tuple reply;
        unsigned long flags;
        struct nf_conn *ct;
        bool taken = true;
        struct net *net;

        nf_ct_invert_tuple(&reply, tuple);

        if (attempts_left > NF_NAT_HARDER_THRESH ||
            tuple->dst.protonum != IPPROTO_TCP ||
            ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
                return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

        /* Last few attempts to find a free tcp port. Destructive
         * action: evict the colliding entry if it's in timewait state
         * and the tcp sequence number has advanced past the one used
         * by the old entry.
         */
        net = nf_ct_net(ignored_conntrack);
        zone = nf_ct_zone(ignored_conntrack);

        thash = nf_conntrack_find_get(net, zone, &reply);
        if (!thash)
                return false;

        ct = nf_ct_tuplehash_to_ctrack(thash);

        if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
                goto out;

        if (WARN_ON_ONCE(ct == ignored_conntrack))
                goto out;

        flags = READ_ONCE(ct->status);
        if (!nf_nat_may_kill(ct, flags))
                goto out;

        if (!nf_seq_has_advanced(ct, ignored_conntrack))
                goto out;

        /* Even if we can evict do not reuse if entry is offloaded. */
        if (nf_ct_kill(ct))
                taken = flags & flags_offload;
out:
        nf_ct_put(ct);
        return taken;
}

static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
                                 const struct nf_nat_range2 *range)
{
        if (t->src.l3num == NFPROTO_IPV4)
                return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
                       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

        return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
               ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}

/* Is the manipulable part of the tuple between min and max incl? */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
                             enum nf_nat_manip_type maniptype,
                             const union nf_conntrack_man_proto *min,
                             const union nf_conntrack_man_proto *max)
{
        __be16 port;

        switch (tuple->dst.protonum) {
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
                       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
        case IPPROTO_GRE: /* all fall through */
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_DCCP:
        case IPPROTO_SCTP:
                if (maniptype == NF_NAT_MANIP_SRC)
                        port = tuple->src.u.all;
                else
                        port = tuple->dst.u.all;

                return ntohs(port) >= ntohs(min->all) &&
                       ntohs(port) <= ntohs(max->all);
        default:
                return true;
        }
}

/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range?
 */
static int nf_in_range(const struct nf_conntrack_tuple *tuple,
                       const struct nf_nat_range2 *range)
{
        /* If we are supposed to map IPs, then we must be in the
         * range specified, otherwise let this drag us onto a new src IP.
         */
        if (range->flags & NF_NAT_RANGE_MAP_IPS &&
            !nf_nat_inet_in_range(tuple, range))
                return 0;

        if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
                return 1;

        return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
                                &range->min_proto, &range->max_proto);
}

static inline int
same_src(const struct nf_conn *ct,
         const struct nf_conntrack_tuple *tuple)
{
        const struct nf_conntrack_tuple *t;

        t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
        return (t->dst.protonum == tuple->dst.protonum &&
                nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
                t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
                     const struct nf_conntrack_zone *zone,
                     const struct nf_conntrack_tuple *tuple,
                     struct nf_conntrack_tuple *result,
                     const struct nf_nat_range2 *range)
{
        unsigned int h = hash_by_src(net, zone, tuple);
        const struct nf_conn *ct;

        hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
                if (same_src(ct, tuple) &&
                    net_eq(net, nf_ct_net(ct)) &&
                    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
                        /* Copy source part from reply tuple. */
                        nf_ct_invert_tuple(result,
                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
                        result->dst = tuple->dst;

                        if (nf_in_range(result, range))
                                return 1;
                }
        }
        return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
                    struct nf_conntrack_tuple *tuple,
                    const struct nf_nat_range2 *range,
                    const struct nf_conn *ct,
                    enum nf_nat_manip_type maniptype)
{
        union nf_inet_addr *var_ipp;
        unsigned int i, max;
        /* Host order */
        u32 minip, maxip, j, dist;
        bool full_range;

        /* No IP mapping? Do nothing. */
        if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
                return;

        if (maniptype == NF_NAT_MANIP_SRC)
                var_ipp = &tuple->src.u3;
        else
                var_ipp = &tuple->dst.u3;

        /* Fast path: only one choice. */
        if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
                *var_ipp = range->min_addr;
                return;
        }

        if (nf_ct_l3num(ct) == NFPROTO_IPV4)
                max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
        else
                max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

        /* Hashing source and destination IPs gives a fairly even
         * spread in practice (if there are a small number of IPs
         * involved, there usually aren't that many connections
         * anyway).  The consistency means that servers see the same
         * client coming from the same IP (some Internet Banking sites
         * like this), even across reboots.
         */
        j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
                   range->flags & NF_NAT_RANGE_PERSISTENT ?
                        0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

        full_range = false;
        for (i = 0; i <= max; i++) {
                /* If first bytes of the address are at the maximum, use the
                 * distance. Otherwise use the full range.
                 */
                if (!full_range) {
                        minip = ntohl((__force __be32)range->min_addr.all[i]);
                        maxip = ntohl((__force __be32)range->max_addr.all[i]);
                        dist = maxip - minip + 1;
                } else {
                        minip = 0;
                        dist = ~0;
                }

                var_ipp->all[i] = (__force __u32)
                        htonl(minip + reciprocal_scale(j, dist));
                if (var_ipp->all[i] != range->max_addr.all[i])
                        full_range = true;

                if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
                        j ^= (__force u32)tuple->dst.u3.all[i];
        }
}
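
/* Worked example for the IPv4 SNAT case: with a range of
 * 10.0.0.1-10.0.0.14 the loop above runs once with dist == 14 and picks
 * 10.0.0.1 + reciprocal_scale(j, 14).  Since j depends only on the
 * source address (plus destination and zone unless
 * NF_NAT_RANGE_PERSISTENT is set), a given client keeps mapping to the
 * same address in the range.
 */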

/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
                                        const struct nf_nat_range2 *range,
                                        enum nf_nat_manip_type maniptype,
                                        const struct nf_conn *ct)
{
        unsigned int range_size, min, max, i, attempts;
        __be16 *keyptr;
        u16 off;

        switch (tuple->dst.protonum) {
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                /* id is same for either direction... */
                keyptr = &tuple->src.u.icmp.id;
                if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
                        min = 0;
                        range_size = 65536;
                } else {
                        min = ntohs(range->min_proto.icmp.id);
                        range_size = ntohs(range->max_proto.icmp.id) -
                                     ntohs(range->min_proto.icmp.id) + 1;
                }
                goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
        case IPPROTO_GRE:
                /* If there is no master conntrack we are not PPTP,
                   do not change tuples */
                if (!ct->master)
                        return;

                if (maniptype == NF_NAT_MANIP_SRC)
                        keyptr = &tuple->src.u.gre.key;
                else
                        keyptr = &tuple->dst.u.gre.key;

                if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
                        min = 1;
                        range_size = 65535;
                } else {
                        min = ntohs(range->min_proto.gre.key);
                        range_size = ntohs(range->max_proto.gre.key) - min + 1;
                }
                goto find_free_id;
#endif
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_TCP:
        case IPPROTO_SCTP:
        case IPPROTO_DCCP:
                if (maniptype == NF_NAT_MANIP_SRC)
                        keyptr = &tuple->src.u.all;
                else
                        keyptr = &tuple->dst.u.all;

                break;
        default:
                return;
        }

        /* If no range specified... */
        if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
                /* If it's dst rewrite, can't change port */
                if (maniptype == NF_NAT_MANIP_DST)
                        return;

                if (ntohs(*keyptr) < 1024) {
                        /* Loose convention: >> 512 is credential passing */
                        if (ntohs(*keyptr) < 512) {
                                min = 1;
                                range_size = 511 - min + 1;
                        } else {
                                min = 600;
                                range_size = 1023 - min + 1;
                        }
                } else {
                        min = 1024;
                        range_size = 65535 - 1024 + 1;
                }
        } else {
                min = ntohs(range->min_proto.all);
                max = ntohs(range->max_proto.all);
                if (unlikely(max < min))
                        swap(max, min);
                range_size = max - min + 1;
        }

find_free_id:
        if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
                off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
        else
                off = get_random_u16();

        attempts = range_size;
        if (attempts > NF_NAT_MAX_ATTEMPTS)
                attempts = NF_NAT_MAX_ATTEMPTS;

        /* We are in softirq; doing a search of the entire range risks
         * soft lockup when all tuples are already used.
         *
         * If we can't find any free port from first offset, pick a new
         * one and try again, with ever smaller search window.
         */
another_round:
        for (i = 0; i < attempts; i++, off++) {
                *keyptr = htons(min + off % range_size);
                if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
                        return;
        }

        if (attempts >= range_size || attempts < 16)
                return;
        attempts /= 2;
        off = get_random_u16();
        goto another_round;
}
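
/* Resulting probe pattern for a large port range: rounds of 128, 64, 32,
 * 16 and finally 8 candidates, each round starting at a fresh random
 * offset (248 probes worst case).  If all of them are taken, the tuple
 * keeps the last candidate and the duplicate is caught later in
 * __nf_conntrack_confirm(), which drops the packet on a final clash.
 */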

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
                 const struct nf_conntrack_tuple *orig_tuple,
                 const struct nf_nat_range2 *range,
                 struct nf_conn *ct,
                 enum nf_nat_manip_type maniptype)
{
        const struct nf_conntrack_zone *zone;
        struct net *net = nf_ct_net(ct);

        zone = nf_ct_zone(ct);

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
         * and that same mapping gives a unique tuple within the given
         * range, use that.
         *
         * This is only required for source (ie. NAT/masq) mappings.
         * So far, we don't do local source mappings, so multiple
         * manips not an issue.
         */
        if (maniptype == NF_NAT_MANIP_SRC &&
            !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
                /* try the original tuple first */
                if (nf_in_range(orig_tuple, range)) {
                        if (!nf_nat_used_tuple(orig_tuple, ct)) {
                                *tuple = *orig_tuple;
                                return;
                        }
                } else if (find_appropriate_src(net, zone,
                                                orig_tuple, tuple, range)) {
                        pr_debug("get_unique_tuple: Found current src map\n");
                        if (!nf_nat_used_tuple(tuple, ct))
                                return;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given range */
        *tuple = *orig_tuple;
        find_best_ips_proto(zone, tuple, range, ct, maniptype);

        /* 3) The per-protocol part of the manip is made to map into
         * the range to make a unique tuple.
         */

        /* Only bother mapping if it's not already in range and unique */
        if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
                if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
                        if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
                            l4proto_in_range(tuple, maniptype,
                                             &range->min_proto,
                                             &range->max_proto) &&
                            (range->min_proto.all == range->max_proto.all ||
                             !nf_nat_used_tuple(tuple, ct)))
                                return;
                } else if (!nf_nat_used_tuple(tuple, ct)) {
                        return;
                }
        }

        /* Last chance: get protocol to try to obtain unique tuple. */
        nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}

struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
        struct nf_conn_nat *nat = nfct_nat(ct);

        if (nat)
                return nat;

        if (!nf_ct_is_confirmed(ct))
                nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

        return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);

unsigned int
nf_nat_setup_info(struct nf_conn *ct,
                  const struct nf_nat_range2 *range,
                  enum nf_nat_manip_type maniptype)
{
        struct net *net = nf_ct_net(ct);
        struct nf_conntrack_tuple curr_tuple, new_tuple;

        /* Can't setup nat info for confirmed ct. */
        if (nf_ct_is_confirmed(ct))
                return NF_ACCEPT;

        WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
                maniptype != NF_NAT_MANIP_DST);

        if (WARN_ON(nf_nat_initialized(ct, maniptype)))
                return NF_DROP;

        /* What we've got will look like inverse of reply. Normally
         * this is what is in the conntrack, except for prior
         * manipulations (future optimization: if num_manips == 0,
         * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
         */
        nf_ct_invert_tuple(&curr_tuple,
                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

        if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
                struct nf_conntrack_tuple reply;

                /* Alter conntrack table so will recognize replies. */
                nf_ct_invert_tuple(&reply, &new_tuple);
                nf_conntrack_alter_reply(ct, &reply);

                /* Non-atomic: we own this at the moment. */
                if (maniptype == NF_NAT_MANIP_SRC)
                        ct->status |= IPS_SRC_NAT;
                else
                        ct->status |= IPS_DST_NAT;

                if (nfct_help(ct) && !nfct_seqadj(ct))
                        if (!nfct_seqadj_ext_add(ct))
                                return NF_DROP;
        }

        if (maniptype == NF_NAT_MANIP_SRC) {
                unsigned int srchash;
                spinlock_t *lock;

                srchash = hash_by_src(net, nf_ct_zone(ct),
                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
                lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
                spin_lock_bh(lock);
                hlist_add_head_rcu(&ct->nat_bysource,
                                   &nf_nat_bysource[srchash]);
                spin_unlock_bh(lock);
        }

        /* It's done. */
        if (maniptype == NF_NAT_MANIP_DST)
                ct->status |= IPS_DST_NAT_DONE;
        else
                ct->status |= IPS_SRC_NAT_DONE;

        return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);

static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
        /* Force range to this IP; let proto decide mapping for
         * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
         * Use reply in case it's already been mangled (eg local packet).
         */
        union nf_inet_addr ip =
                (manip == NF_NAT_MANIP_SRC ?
                ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
                ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
        struct nf_nat_range2 range = {
                .flags = NF_NAT_RANGE_MAP_IPS,
                .min_addr = ip,
                .max_addr = ip,
        };

        return nf_nat_setup_info(ct, &range, manip);
}
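
/* A null binding deliberately rewrites nothing by itself: with
 * min_addr == max_addr the connection is pinned to its current address,
 * and because NF_NAT_RANGE_PROTO_SPECIFIED is not set the l4 part is
 * only altered if the tuple would otherwise clash with an existing one.
 */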

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
        return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
                           enum ip_conntrack_info ctinfo,
                           unsigned int hooknum,
                           struct sk_buff *skb)
{
        enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        unsigned int verdict = NF_ACCEPT;
        unsigned long statusbit;

        if (mtype == NF_NAT_MANIP_SRC)
                statusbit = IPS_SRC_NAT;
        else
                statusbit = IPS_DST_NAT;

        /* Invert if this is reply dir. */
        if (dir == IP_CT_DIR_REPLY)
                statusbit ^= IPS_NAT_MASK;

        /* Non-atomic: these bits don't change. */
        if (ct->status & statusbit)
                verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);

        return verdict;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);

static bool in_vrf_postrouting(const struct nf_hook_state *state)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (state->hook == NF_INET_POST_ROUTING &&
            netif_is_l3_master(state->out))
                return true;
#endif
        return false;
}

unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;
        struct nf_conn_nat *nat;
        /* maniptype == SRC for postrouting. */
        enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

        ct = nf_ct_get(skb, &ctinfo);
        /* Can't track?  It's not due to stress, or conntrack would
         * have dropped it.  Hence it's the user's responsibility to
         * packet filter it out, or implement conntrack/NAT for that
         * protocol. 8) --RR
         */
        if (!ct || in_vrf_postrouting(state))
                return NF_ACCEPT;

        nat = nfct_nat(ct);

        switch (ctinfo) {
        case IP_CT_RELATED:
        case IP_CT_RELATED_REPLY:
                /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
        case IP_CT_NEW:
                /* Seen it before? This can happen for loopback, retrans,
                 * or local packets.
                 */
                if (!nf_nat_initialized(ct, maniptype)) {
                        struct nf_nat_lookup_hook_priv *lpriv = priv;
                        struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
                        unsigned int ret;
                        int i;

                        if (!e)
                                goto null_bind;

                        for (i = 0; i < e->num_hook_entries; i++) {
                                ret = e->hooks[i].hook(e->hooks[i].priv, skb,
                                                       state);
                                if (ret != NF_ACCEPT)
                                        return ret;
                                if (nf_nat_initialized(ct, maniptype))
                                        goto do_nat;
                        }
null_bind:
                        ret = nf_nat_alloc_null_binding(ct, state->hook);
                        if (ret != NF_ACCEPT)
                                return ret;
                } else {
                        pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
                                 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
                                 ct, ct->status);
                        if (nf_nat_oif_changed(state->hook, ctinfo, nat,
                                               state->out))
                                goto oif_changed;
                }
                break;
        default:
                /* ESTABLISHED */
                WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
                        ctinfo != IP_CT_ESTABLISHED_REPLY);
                if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
                        goto oif_changed;
        }
do_nat:
        return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
        nf_ct_kill_acct(ct, ctinfo, skb);
        return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);

struct nf_nat_proto_clean {
        u8 l3proto;
        u8 l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
        const struct nf_nat_proto_clean *clean = data;

        if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
            (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
                return 0;

        return i->status & IPS_NAT_MASK ? 1 : 0;
}

static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
        unsigned int h;

        h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
        hlist_del_rcu(&ct->nat_bysource);
        spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}

static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
        if (nf_nat_proto_remove(ct, data))
                return 1;

        /* This module is being removed and the conntrack has a nat null
         * binding. Remove it from the bysource hash, as the table will
         * be freed soon.
         *
         * Otherwise, when the conntrack is destroyed,
         * nf_nat_cleanup_conntrack() would delete the entry from an
         * already-freed table.
         */
        if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
                nf_nat_cleanup_conntrack(ct);

        /* don't delete conntrack. Although that would make things a lot
         * simpler, we'd end up flushing all conntracks on nat rmmod.
         */
        return 0;
}

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
        [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
        [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
};

static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
                                          struct nf_nat_range2 *range)
{
        if (tb[CTA_PROTONAT_PORT_MIN]) {
                range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
                range->max_proto.all = range->min_proto.all;
                range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
        }
        if (tb[CTA_PROTONAT_PORT_MAX]) {
                range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
                range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
        }
        return 0;
}

static int nfnetlink_parse_nat_proto(struct nlattr *attr,
                                     const struct nf_conn *ct,
                                     struct nf_nat_range2 *range)
{
        struct nlattr *tb[CTA_PROTONAT_MAX+1];
        int err;

        err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr,
                                          protonat_nla_policy, NULL);
        if (err < 0)
                return err;

        return nf_nat_l4proto_nlattr_to_range(tb, range);
}

static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
        [CTA_NAT_V4_MINIP] = { .type = NLA_U32 },
        [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 },
        [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) },
        [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) },
        [CTA_NAT_PROTO] = { .type = NLA_NESTED },
};

static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
                                       struct nf_nat_range2 *range)
{
        if (tb[CTA_NAT_V4_MINIP]) {
                range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
                range->flags |= NF_NAT_RANGE_MAP_IPS;
        }

        if (tb[CTA_NAT_V4_MAXIP])
                range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
        else
                range->max_addr.ip = range->min_addr.ip;

        return 0;
}

static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
                                       struct nf_nat_range2 *range)
{
        if (tb[CTA_NAT_V6_MINIP]) {
                nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
                           sizeof(struct in6_addr));
                range->flags |= NF_NAT_RANGE_MAP_IPS;
        }

        if (tb[CTA_NAT_V6_MAXIP])
                nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
                           sizeof(struct in6_addr));
        else
                range->max_addr = range->min_addr;

        return 0;
}

static int
nfnetlink_parse_nat(const struct nlattr *nat,
                    const struct nf_conn *ct, struct nf_nat_range2 *range)
{
        struct nlattr *tb[CTA_NAT_MAX+1];
        int err;

        memset(range, 0, sizeof(*range));

        err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat,
                                          nat_nla_policy, NULL);
        if (err < 0)
                return err;

        switch (nf_ct_l3num(ct)) {
        case NFPROTO_IPV4:
                err = nf_nat_ipv4_nlattr_to_range(tb, range);
                break;
        case NFPROTO_IPV6:
                err = nf_nat_ipv6_nlattr_to_range(tb, range);
                break;
        default:
                err = -EPROTONOSUPPORT;
                break;
        }

        if (err)
                return err;

        if (!tb[CTA_NAT_PROTO])
                return 0;

        return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
}
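
/* Shape of the netlink payload parsed above, with example (hypothetical)
 * values for the IPv4 case:
 *
 *      CTA_NAT_V4_MINIP        10.0.0.1
 *      CTA_NAT_V4_MAXIP        10.0.0.14  (optional, defaults to MINIP)
 *      CTA_NAT_PROTO           (optional nested attribute)
 *        CTA_PROTONAT_PORT_MIN 4000
 *        CTA_PROTONAT_PORT_MAX 4100       (optional, defaults to PORT_MIN)
 */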

/* This function is called under rcu_read_lock() */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
                          enum nf_nat_manip_type manip,
                          const struct nlattr *attr)
{
        struct nf_nat_range2 range;
        int err;

        /* Should not happen, restricted to creating new conntracks
         * via ctnetlink.
         */
        if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
                return -EEXIST;

        /* No NAT information has been passed, allocate the null-binding */
        if (attr == NULL)
                return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0;

        err = nfnetlink_parse_nat(attr, ct, &range);
        if (err < 0)
                return err;

        return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
                          enum nf_nat_manip_type manip,
                          const struct nlattr *attr)
{
        return -EOPNOTSUPP;
}
#endif

static struct nf_ct_helper_expectfn follow_master_nat = {
        .name = "nat-follow-master",
        .expectfn = nf_nat_follow_master,
};

int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
                       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
        struct nat_net *nat_net = net_generic(net, nat_net_id);
        struct nf_nat_hooks_net *nat_proto_net;
        struct nf_nat_lookup_hook_priv *priv;
        unsigned int hooknum = ops->hooknum;
        struct nf_hook_ops *nat_ops;
        int i, ret;

        if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
                return -EINVAL;

        nat_proto_net = &nat_net->nat_proto_net[pf];

        for (i = 0; i < ops_count; i++) {
                if (orig_nat_ops[i].hooknum == hooknum) {
                        hooknum = i;
                        break;
                }
        }

        if (WARN_ON_ONCE(i == ops_count))
                return -EINVAL;

        mutex_lock(&nf_nat_proto_mutex);
        if (!nat_proto_net->nat_hook_ops) {
                WARN_ON(nat_proto_net->users != 0);

                nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
                if (!nat_ops) {
                        mutex_unlock(&nf_nat_proto_mutex);
                        return -ENOMEM;
                }

                for (i = 0; i < ops_count; i++) {
                        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
                        if (priv) {
                                nat_ops[i].priv = priv;
                                continue;
                        }
                        mutex_unlock(&nf_nat_proto_mutex);
                        while (i)
                                kfree(nat_ops[--i].priv);
                        kfree(nat_ops);
                        return -ENOMEM;
                }

                ret = nf_register_net_hooks(net, nat_ops, ops_count);
                if (ret < 0) {
                        mutex_unlock(&nf_nat_proto_mutex);
                        for (i = 0; i < ops_count; i++)
                                kfree(nat_ops[i].priv);
                        kfree(nat_ops);
                        return ret;
                }

                nat_proto_net->nat_hook_ops = nat_ops;
        }

        nat_ops = nat_proto_net->nat_hook_ops;
        priv = nat_ops[hooknum].priv;
        if (WARN_ON_ONCE(!priv)) {
                mutex_unlock(&nf_nat_proto_mutex);
                return -EOPNOTSUPP;
        }

        ret = nf_hook_entries_insert_raw(&priv->entries, ops);
        if (ret == 0)
                nat_proto_net->users++;

        mutex_unlock(&nf_nat_proto_mutex);
        return ret;
}

void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
                          unsigned int ops_count)
{
        struct nat_net *nat_net = net_generic(net, nat_net_id);
        struct nf_nat_hooks_net *nat_proto_net;
        struct nf_nat_lookup_hook_priv *priv;
        struct nf_hook_ops *nat_ops;
        int hooknum = ops->hooknum;
        int i;

        if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
                return;

        nat_proto_net = &nat_net->nat_proto_net[pf];

        mutex_lock(&nf_nat_proto_mutex);
        if (WARN_ON(nat_proto_net->users == 0))
                goto unlock;

        nat_proto_net->users--;

        nat_ops = nat_proto_net->nat_hook_ops;
        for (i = 0; i < ops_count; i++) {
                if (nat_ops[i].hooknum == hooknum) {
                        hooknum = i;
                        break;
                }
        }
        if (WARN_ON_ONCE(i == ops_count))
                goto unlock;
        priv = nat_ops[hooknum].priv;
        nf_hook_entries_delete_raw(&priv->entries, ops);

        if (nat_proto_net->users == 0) {
                nf_unregister_net_hooks(net, nat_ops, ops_count);

                for (i = 0; i < ops_count; i++) {
                        priv = nat_ops[i].priv;
                        kfree_rcu(priv, rcu_head);
                }

                nat_proto_net->nat_hook_ops = NULL;
                kfree(nat_ops);
        }
unlock:
        mutex_unlock(&nf_nat_proto_mutex);
}

static struct pernet_operations nat_net_ops = {
        .id = &nat_net_id,
        .size = sizeof(struct nat_net),
};

static const struct nf_nat_hook nat_hook = {
        .parse_nat_setup = nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
        .decode_session = __nf_nat_decode_session,
#endif
        .manip_pkt = nf_nat_manip_pkt,
        .remove_nat_bysrc = nf_nat_cleanup_conntrack,
};

static int __init nf_nat_init(void)
{
        int ret, i;

        /* Leave them the same for the moment. */
        nf_nat_htable_size = nf_conntrack_htable_size;
        if (nf_nat_htable_size < CONNTRACK_LOCKS)
                nf_nat_htable_size = CONNTRACK_LOCKS;

        nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
        if (!nf_nat_bysource)
                return -ENOMEM;

        for (i = 0; i < CONNTRACK_LOCKS; i++)
                spin_lock_init(&nf_nat_locks[i]);

        ret = register_pernet_subsys(&nat_net_ops);
        if (ret < 0) {
                kvfree(nf_nat_bysource);
                return ret;
        }

        nf_ct_helper_expectfn_register(&follow_master_nat);

        WARN_ON(nf_nat_hook != NULL);
        RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

        ret = register_nf_nat_bpf();
        if (ret < 0) {
                RCU_INIT_POINTER(nf_nat_hook, NULL);
                nf_ct_helper_expectfn_unregister(&follow_master_nat);
                synchronize_net();
                unregister_pernet_subsys(&nat_net_ops);
                kvfree(nf_nat_bysource);
        }

        return ret;
}

static void __exit nf_nat_cleanup(void)
{
        struct nf_nat_proto_clean clean = {};

        nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

        nf_ct_helper_expectfn_unregister(&follow_master_nat);
        RCU_INIT_POINTER(nf_nat_hook, NULL);

        synchronize_net();
        kvfree(nf_nat_bysource);
        unregister_pernet_subsys(&nat_net_ops);
}

MODULE_LICENSE("GPL");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);