1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | |
3 | /* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges |
4 | * |
5 | * Copyright (c) 2019-2020 Red Hat GmbH |
6 | * |
7 | * Author: Stefano Brivio <sbrivio@redhat.com> |
8 | */ |
9 | |
10 | /** |
11 | * DOC: Theory of Operation |
12 | * |
13 | * |
14 | * Problem |
15 | * ------- |
16 | * |
17 | * Match packet bytes against entries composed of ranged or non-ranged packet |
18 | * field specifiers, mapping them to arbitrary references. For example: |
19 | * |
20 | * :: |
21 | * |
22 | * --- fields ---> |
23 | * | [net],[port],[net]... => [reference] |
24 | * entries [net],[port],[net]... => [reference] |
25 | * | [net],[port],[net]... => [reference] |
26 | * V ... |
27 | * |
28 | * where [net] fields can be IP ranges or netmasks, and [port] fields are port |
29 | * ranges. Arbitrary packet fields can be matched. |
30 | * |
31 | * |
32 | * Algorithm Overview |
33 | * ------------------ |
34 | * |
35 | * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally |
36 | * relies on the consideration that every contiguous range in a space of b bits |
37 | * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], |
38 | * as also illustrated in Section 9 of [Kogan 2014]. |
39 | * |
40 | * Classification against a number of entries, that require matching given bits |
41 | * of a packet field, is performed by grouping those bits in sets of arbitrary |
42 | * size, and classifying packet bits one group at a time. |
43 | * |
44 | * Example: |
45 | * to match the source port (16 bits) of a packet, we can divide those 16 bits |
46 | * in 4 groups of 4 bits each. Given the entry: |
47 | * 0000 0001 0101 1001 |
48 | * and a packet with source port: |
49 | * 0000 0001 1010 1001 |
50 | * first and second groups match, but the third doesn't. We conclude that the |
51 | * packet doesn't match the given entry. |
52 | * |
53 | * Translate the set to a sequence of lookup tables, one per field. Each table |
54 | * has two dimensions: bit groups to be matched for a single packet field, and |
55 | * all the possible values of said groups (buckets). Input entries are |
56 | * represented as one or more rules, depending on the number of composing |
57 | * netmasks for the given field specifier, and a group match is indicated as a |
58 | * set bit, with number corresponding to the rule index, in all the buckets |
59 | * whose value matches the entry for a given group. |
60 | * |
61 | * Rules are mapped between fields through an array of x, n pairs, with each |
62 | * item mapping a matched rule to one or more rules. The position of the pair in |
63 | * the array indicates the matched rule to be mapped to the next field, x |
64 | * indicates the first rule index in the next field, and n the amount of |
65 | * next-field rules the current rule maps to. |
66 | * |
67 | * The mapping array for the last field maps to the desired references. |
68 | * |
69 | * To match, we perform table lookups using the values of grouped packet bits, |
70 | * and use a sequence of bitwise operations to progressively evaluate rule |
71 | * matching. |
72 | * |
73 | * A stand-alone, reference implementation, also including notes about possible |
74 | * future optimisations, is available at: |
75 | * https://pipapo.lameexcu.se/ |
76 | * |
77 | * Insertion |
78 | * --------- |
79 | * |
80 | * - For each packet field: |
81 | * |
82 | * - divide the b packet bits we want to classify into groups of size t, |
83 | * obtaining ceil(b / t) groups |
84 | * |
85 | * Example: match on destination IP address, with t = 4: 32 bits, 8 groups |
86 | * of 4 bits each |
87 | * |
88 | * - allocate a lookup table with one column ("bucket") for each possible |
89 | * value of a group, and with one row for each group |
90 | * |
91 | * Example: 8 groups, 2^4 buckets: |
92 | * |
93 | * :: |
94 | * |
95 | * bucket |
96 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
97 | * 0 |
98 | * 1 |
99 | * 2 |
100 | * 3 |
101 | * 4 |
102 | * 5 |
103 | * 6 |
104 | * 7 |
105 | * |
106 | * - map the bits we want to classify for the current field, for a given |
107 | * entry, to a single rule for non-ranged and netmask set items, and to one |
108 | * or multiple rules for ranges. Ranges are expanded to composing netmasks |
109 | * by pipapo_expand(). |
110 | * |
111 | * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 |
112 | * - rule #0: 10.0.0.5 |
113 | * - rule #1: 192.168.1.0/24 |
114 | * - rule #2: 192.168.2.0/31 |
115 | * |
116 | * - insert references to the rules in the lookup table, selecting buckets |
117 | * according to bit values of a rule in the given group. This is done by |
118 | * pipapo_insert(). |
119 | * |
120 | * Example: given: |
121 | * - rule #0: 10.0.0.5 mapping to buckets |
122 | * < 0 10 0 0 0 0 0 5 > |
123 | * - rule #1: 192.168.1.0/24 mapping to buckets |
124 | * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > |
125 | * - rule #2: 192.168.2.0/31 mapping to buckets |
126 | * < 12 0 10 8 0 2 0 < 0..1 > > |
127 | * |
128 | * these bits are set in the lookup table: |
129 | * |
130 | * :: |
131 | * |
132 | * bucket |
133 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
134 | * 0 0 1,2 |
135 | * 1 1,2 0 |
136 | * 2 0 1,2 |
137 | * 3 0 1,2 |
138 | * 4 0,1,2 |
139 | * 5 0 1 2 |
140 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
141 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 |
142 | * |
143 | * - if this is not the last field in the set, fill a mapping array that maps |
144 | * rules from the lookup table to rules belonging to the same entry in |
145 | * the next lookup table, done by pipapo_map(). |
146 | * |
147 | * Note that as rules map to contiguous ranges of rules, given how netmask |
148 | * expansion and insertion is performed, &union nft_pipapo_map_bucket stores |
149 | * this information as pairs of first rule index, rule count. |
150 | * |
151 | * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, |
152 | * given lookup table #0 for field 0 (see example above): |
153 | * |
154 | * :: |
155 | * |
156 | * bucket |
157 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
158 | * 0 0 1,2 |
159 | * 1 1,2 0 |
160 | * 2 0 1,2 |
161 | * 3 0 1,2 |
162 | * 4 0,1,2 |
163 | * 5 0 1 2 |
164 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
165 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 |
166 | * |
167 | * and lookup table #1 for field 1 with: |
168 | * - rule #0: 1024 mapping to buckets |
169 | * < 0 0 4 0 > |
170 | * - rule #1: 2048 mapping to buckets |
171 | * < 0 0 5 0 > |
172 | * |
173 | * :: |
174 | * |
175 | * bucket |
176 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
177 | * 0 0,1 |
178 | * 1 0,1 |
179 | * 2 0 1 |
180 | * 3 0,1 |
181 | * |
182 | * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 |
183 | * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 |
184 | * (rules #1, #2) to 2048 in lookup table #2 (rule #1): |
185 | * |
186 | * :: |
187 | * |
188 | * rule indices in current field: 0 1 2 |
189 | * map to rules in next field: 0 1 1 |
190 | * |
191 | * - if this is the last field in the set, fill a mapping array that maps |
192 | * rules from the last lookup table to element pointers, also done by |
193 | * pipapo_map(). |
194 | * |
195 | * Note that, in this implementation, we have two elements (start, end) for |
196 | * each entry. The pointer to the end element is stored in this array, and |
197 | * the pointer to the start element is linked from it. |
198 | * |
199 | * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem |
200 | * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. |
201 | * From the rules of lookup table #1 as mapped above: |
202 | * |
203 | * :: |
204 | * |
205 | * rule indices in last field: 0 1 |
206 | * map to elements: 0x66 0x42 |
207 | * |
208 | * |
209 | * Matching |
210 | * -------- |
211 | * |
212 | * We use a result bitmap, with the size of a single lookup table bucket, to |
213 | * represent the matching state that applies at every algorithm step. This is |
214 | * done by pipapo_lookup(). |
215 | * |
216 | * - For each packet field: |
217 | * |
218 | * - start with an all-ones result bitmap (res_map in pipapo_lookup()) |
219 | * |
220 | * - perform a lookup into the table corresponding to the current field, |
221 | * for each group, and at every group, AND the current result bitmap with |
222 | * the value from the lookup table bucket |
223 | * |
224 | * :: |
225 | * |
226 | * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from |
227 | * insertion examples. |
228 | * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for |
229 | * convenience in this example. Initial result bitmap is 0xff, the steps |
230 | * below show the value of the result bitmap after each group is processed: |
231 | * |
232 | * bucket |
233 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
234 | * 0 0 1,2 |
235 | * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 |
236 | * |
237 | * 1 1,2 0 |
238 | * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 |
239 | * |
240 | * 2 0 1,2 |
241 | * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 |
242 | * |
243 | * 3 0 1,2 |
244 | * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 |
245 | * |
246 | * 4 0,1,2 |
247 | * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 |
248 | * |
249 | * 5 0 1 2 |
250 | * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 |
251 | * |
252 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
253 | * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 |
254 | * |
255 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 |
256 | * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 |
257 | * |
258 | * - at the next field, start with a new, all-zeroes result bitmap. For each |
259 | * bit set in the previous result bitmap, fill the new result bitmap |
260 | * (fill_map in pipapo_lookup()) with the rule indices from the |
261 | * corresponding buckets of the mapping field for this field, done by |
262 | * pipapo_refill() |
263 | * |
264 | * Example: with mapping table from insertion examples, with the current |
265 | * result bitmap from the previous example, 0x02: |
266 | * |
267 | * :: |
268 | * |
269 | * rule indices in current field: 0 1 2 |
270 | * map to rules in next field: 0 1 1 |
271 | * |
272 | * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be |
273 | * set. |
274 | * |
275 | * We can now extend this example to cover the second iteration of the step |
276 | * above (lookup and AND bitmap): assuming the port field is |
277 | * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table |
278 | * for "port" field from pre-computation example: |
279 | * |
280 | * :: |
281 | * |
282 | * bucket |
283 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
284 | * 0 0,1 |
285 | * 1 0,1 |
286 | * 2 0 1 |
287 | * 3 0,1 |
288 | * |
289 | * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] |
290 | * & 0x3 [bucket 0], resulting bitmap is 0x2. |
291 | * |
292 | * - if this is the last field in the set, look up the value from the mapping |
293 | * array corresponding to the final result bitmap |
294 | * |
295 | * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for |
296 | * last field from insertion example: |
297 | * |
298 | * :: |
299 | * |
300 | * rule indices in last field: 0 1 |
301 | * map to elements: 0x66 0x42 |
302 | * |
303 | * the matching element is at 0x42. |
304 | * |
305 | * |
306 | * References |
307 | * ---------- |
308 | * |
309 | * [Ligatti 2010] |
310 | * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with |
311 | * Automatic Time-space Tradeoffs |
312 | * Jay Ligatti, Josh Kuhn, and Chris Gage. |
313 | * Proceedings of the IEEE International Conference on Computer |
314 | * Communication Networks (ICCCN), August 2010. |
315 | * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf |
316 | * |
317 | * [Rottenstreich 2010] |
318 | * Worst-Case TCAM Rule Expansion |
319 | * Ori Rottenstreich and Isaac Keslassy. |
320 | * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. |
321 | * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf |
322 | * |
323 | * [Kogan 2014] |
324 | * SAX-PAC (Scalable And eXpressive PAcket Classification) |
325 | * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, |
326 | * and Patrick Eugster. |
327 | * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. |
328 | * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf |
329 | */ |
330 | |
331 | #include <linux/kernel.h> |
332 | #include <linux/init.h> |
333 | #include <linux/module.h> |
334 | #include <linux/netlink.h> |
335 | #include <linux/netfilter.h> |
336 | #include <linux/netfilter/nf_tables.h> |
337 | #include <net/netfilter/nf_tables_core.h> |
338 | #include <uapi/linux/netfilter/nf_tables.h> |
339 | #include <linux/bitmap.h> |
340 | #include <linux/bitops.h> |
341 | |
342 | #include "nft_set_pipapo_avx2.h" |
343 | #include "nft_set_pipapo.h" |
344 | |
345 | /** |
346 | * pipapo_refill() - For each set bit, set bits from selected mapping table item |
347 | * @map: Bitmap to be scanned for set bits |
348 | * @len: Length of bitmap in longs |
349 | * @rules: Number of rules in field |
350 | * @dst: Destination bitmap |
351 | * @mt: Mapping table containing bit set specifiers |
352 | * @match_only: Find a single bit and return, don't fill |
353 | * |
354 | * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. |
355 | * |
356 | * For each bit set in map, select the bucket from mapping table with index |
357 | * corresponding to the position of the bit set. Use start bit and amount of |
358 | * bits specified in bucket to fill region in dst. |
359 | * |
360 | * Return: -1 on no match, bit position on 'match_only', 0 otherwise. |
361 | */ |
362 | int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, |
363 | unsigned long *dst, |
364 | const union nft_pipapo_map_bucket *mt, bool match_only) |
365 | { |
366 | unsigned long bitset; |
367 | unsigned int k; |
368 | int ret = -1; |
369 | |
370 | for (k = 0; k < len; k++) { |
371 | bitset = map[k]; |
372 | while (bitset) { |
373 | unsigned long t = bitset & -bitset; |
374 | int r = __builtin_ctzl(bitset); |
375 | int i = k * BITS_PER_LONG + r; |
376 | |
377 | if (unlikely(i >= rules)) { |
378 | map[k] = 0; |
379 | return -1; |
380 | } |
381 | |
382 | if (match_only) { |
383 | bitmap_clear(map, start: i, nbits: 1); |
384 | return i; |
385 | } |
386 | |
387 | ret = 0; |
388 | |
389 | bitmap_set(map: dst, start: mt[i].to, nbits: mt[i].n); |
390 | |
391 | bitset ^= t; |
392 | } |
393 | map[k] = 0; |
394 | } |
395 | |
396 | return ret; |
397 | } |
398 | |
399 | /** |
400 | * nft_pipapo_lookup() - Lookup function |
401 | * @net: Network namespace |
402 | * @set: nftables API set representation |
403 | * @key: nftables API element representation containing key data |
404 | * @ext: nftables API extension pointer, filled with matching reference |
405 | * |
406 | * For more details, see DOC: Theory of Operation. |
407 | * |
408 | * Return: true on match, false otherwise. |
409 | */ |
410 | bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, |
411 | const u32 *key, const struct nft_set_ext **ext) |
412 | { |
413 | struct nft_pipapo *priv = nft_set_priv(set); |
414 | struct nft_pipapo_scratch *scratch; |
415 | unsigned long *res_map, *fill_map; |
416 | u8 genmask = nft_genmask_cur(net); |
417 | const struct nft_pipapo_match *m; |
418 | const struct nft_pipapo_field *f; |
419 | const u8 *rp = (const u8 *)key; |
420 | bool map_index; |
421 | int i; |
422 | |
423 | local_bh_disable(); |
424 | |
425 | m = rcu_dereference(priv->match); |
426 | |
427 | if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) |
428 | goto out; |
429 | |
430 | scratch = *raw_cpu_ptr(m->scratch); |
431 | |
432 | map_index = scratch->map_index; |
433 | |
434 | res_map = scratch->map + (map_index ? m->bsize_max : 0); |
435 | fill_map = scratch->map + (map_index ? 0 : m->bsize_max); |
436 | |
437 | memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); |
438 | |
439 | nft_pipapo_for_each_field(f, i, m) { |
440 | bool last = i == m->field_count - 1; |
441 | int b; |
442 | |
443 | /* For each bit group: select lookup table bucket depending on |
444 | * packet bytes value, then AND bucket value |
445 | */ |
446 | if (likely(f->bb == 8)) |
447 | pipapo_and_field_buckets_8bit(f, dst: res_map, data: rp); |
448 | else |
449 | pipapo_and_field_buckets_4bit(f, dst: res_map, data: rp); |
450 | NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; |
451 | |
452 | rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); |
453 | |
454 | /* Now populate the bitmap for the next field, unless this is |
455 | * the last field, in which case return the matched 'ext' |
456 | * pointer if any. |
457 | * |
458 | * Now res_map contains the matching bitmap, and fill_map is the |
459 | * bitmap for the next field. |
460 | */ |
461 | next_match: |
462 | b = pipapo_refill(map: res_map, len: f->bsize, rules: f->rules, dst: fill_map, mt: f->mt, |
463 | match_only: last); |
464 | if (b < 0) { |
465 | scratch->map_index = map_index; |
466 | local_bh_enable(); |
467 | |
468 | return false; |
469 | } |
470 | |
471 | if (last) { |
472 | *ext = &f->mt[b].e->ext; |
473 | if (unlikely(nft_set_elem_expired(*ext) || |
474 | !nft_set_elem_active(*ext, genmask))) |
475 | goto next_match; |
476 | |
477 | /* Last field: we're just returning the key without |
478 | * filling the initial bitmap for the next field, so the |
479 | * current inactive bitmap is clean and can be reused as |
480 | * *next* bitmap (not initial) for the next packet. |
481 | */ |
482 | scratch->map_index = map_index; |
483 | local_bh_enable(); |
484 | |
485 | return true; |
486 | } |
487 | |
488 | /* Swap bitmap indices: res_map is the initial bitmap for the |
489 | * next field, and fill_map is guaranteed to be all-zeroes at |
490 | * this point. |
491 | */ |
492 | map_index = !map_index; |
493 | swap(res_map, fill_map); |
494 | |
495 | rp += NFT_PIPAPO_GROUPS_PADDING(f); |
496 | } |
497 | |
498 | out: |
499 | local_bh_enable(); |
500 | return false; |
501 | } |
502 | |
503 | /** |
504 | * pipapo_get() - Get matching element reference given key data |
505 | * @net: Network namespace |
506 | * @set: nftables API set representation |
507 | * @data: Key data to be matched against existing elements |
508 | * @genmask: If set, check that element is active in given genmask |
509 | * @tstamp: timestamp to check for expired elements |
510 | * @gfp: the type of memory to allocate (see kmalloc). |
511 | * |
512 | * This is essentially the same as the lookup function, except that it matches |
513 | * key data against the uncommitted copy and doesn't use preallocated maps for |
514 | * bitmap results. |
515 | * |
516 | * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. |
517 | */ |
518 | static struct nft_pipapo_elem *pipapo_get(const struct net *net, |
519 | const struct nft_set *set, |
520 | const u8 *data, u8 genmask, |
521 | u64 tstamp, gfp_t gfp) |
522 | { |
523 | struct nft_pipapo_elem *ret = ERR_PTR(error: -ENOENT); |
524 | struct nft_pipapo *priv = nft_set_priv(set); |
525 | unsigned long *res_map, *fill_map = NULL; |
526 | const struct nft_pipapo_match *m; |
527 | const struct nft_pipapo_field *f; |
528 | int i; |
529 | |
530 | m = priv->clone; |
531 | if (m->bsize_max == 0) |
532 | return ret; |
533 | |
534 | res_map = kmalloc_array(n: m->bsize_max, size: sizeof(*res_map), flags: gfp); |
535 | if (!res_map) { |
536 | ret = ERR_PTR(error: -ENOMEM); |
537 | goto out; |
538 | } |
539 | |
540 | fill_map = kcalloc(n: m->bsize_max, size: sizeof(*res_map), flags: gfp); |
541 | if (!fill_map) { |
542 | ret = ERR_PTR(error: -ENOMEM); |
543 | goto out; |
544 | } |
545 | |
546 | memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); |
547 | |
548 | nft_pipapo_for_each_field(f, i, m) { |
549 | bool last = i == m->field_count - 1; |
550 | int b; |
551 | |
552 | /* For each bit group: select lookup table bucket depending on |
553 | * packet bytes value, then AND bucket value |
554 | */ |
555 | if (f->bb == 8) |
556 | pipapo_and_field_buckets_8bit(f, dst: res_map, data); |
557 | else if (f->bb == 4) |
558 | pipapo_and_field_buckets_4bit(f, dst: res_map, data); |
559 | else |
560 | BUG(); |
561 | |
562 | data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); |
563 | |
564 | /* Now populate the bitmap for the next field, unless this is |
565 | * the last field, in which case return the matched 'ext' |
566 | * pointer if any. |
567 | * |
568 | * Now res_map contains the matching bitmap, and fill_map is the |
569 | * bitmap for the next field. |
570 | */ |
571 | next_match: |
572 | b = pipapo_refill(map: res_map, len: f->bsize, rules: f->rules, dst: fill_map, mt: f->mt, |
573 | match_only: last); |
574 | if (b < 0) |
575 | goto out; |
576 | |
577 | if (last) { |
578 | if (__nft_set_elem_expired(ext: &f->mt[b].e->ext, tstamp)) |
579 | goto next_match; |
580 | if ((genmask && |
581 | !nft_set_elem_active(ext: &f->mt[b].e->ext, genmask))) |
582 | goto next_match; |
583 | |
584 | ret = f->mt[b].e; |
585 | goto out; |
586 | } |
587 | |
588 | data += NFT_PIPAPO_GROUPS_PADDING(f); |
589 | |
590 | /* Swap bitmap indices: fill_map will be the initial bitmap for |
591 | * the next field (i.e. the new res_map), and res_map is |
592 | * guaranteed to be all-zeroes at this point, ready to be filled |
593 | * according to the next mapping table. |
594 | */ |
595 | swap(res_map, fill_map); |
596 | } |
597 | |
598 | out: |
599 | kfree(objp: fill_map); |
600 | kfree(objp: res_map); |
601 | return ret; |
602 | } |
603 | |
604 | /** |
605 | * nft_pipapo_get() - Get matching element reference given key data |
606 | * @net: Network namespace |
607 | * @set: nftables API set representation |
608 | * @elem: nftables API element representation containing key data |
609 | * @flags: Unused |
610 | */ |
611 | static struct nft_elem_priv * |
612 | nft_pipapo_get(const struct net *net, const struct nft_set *set, |
613 | const struct nft_set_elem *elem, unsigned int flags) |
614 | { |
615 | struct nft_pipapo_elem *e; |
616 | |
617 | e = pipapo_get(net, set, data: (const u8 *)elem->key.val.data, |
618 | genmask: nft_genmask_cur(net), tstamp: get_jiffies_64(), |
619 | GFP_ATOMIC); |
620 | if (IS_ERR(ptr: e)) |
621 | return ERR_CAST(ptr: e); |
622 | |
623 | return &e->priv; |
624 | } |
625 | |
626 | /** |
627 | * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize |
628 | * @f: Field containing mapping table |
629 | * @old_rules: Amount of existing mapped rules |
630 | * @rules: Amount of new rules to map |
631 | * |
632 | * Return: 0 on success, negative error code on failure. |
633 | */ |
634 | static int pipapo_realloc_mt(struct nft_pipapo_field *f, |
635 | unsigned int old_rules, unsigned int rules) |
636 | { |
637 | union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt; |
638 | const unsigned int = PAGE_SIZE / sizeof(*new_mt); |
639 | unsigned int rules_alloc = rules; |
640 | |
641 | might_sleep(); |
642 | |
643 | if (unlikely(rules == 0)) |
644 | goto out_free; |
645 | |
646 | /* growing and enough space left, no action needed */ |
647 | if (rules > old_rules && f->rules_alloc > rules) |
648 | return 0; |
649 | |
650 | /* downsize and extra slack has not grown too large */ |
651 | if (rules < old_rules) { |
652 | unsigned int remove = f->rules_alloc - rules; |
653 | |
654 | if (remove < (2u * extra)) |
655 | return 0; |
656 | } |
657 | |
658 | /* If set needs more than one page of memory for rules then |
659 | * allocate another extra page to avoid frequent reallocation. |
660 | */ |
661 | if (rules > extra && |
662 | check_add_overflow(rules, extra, &rules_alloc)) |
663 | return -EOVERFLOW; |
664 | |
665 | new_mt = kvmalloc_array(n: rules_alloc, size: sizeof(*new_mt), GFP_KERNEL); |
666 | if (!new_mt) |
667 | return -ENOMEM; |
668 | |
669 | if (old_mt) |
670 | memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt)); |
671 | |
672 | if (rules > old_rules) { |
673 | memset(new_mt + old_rules, 0, |
674 | (rules - old_rules) * sizeof(*new_mt)); |
675 | } |
676 | out_free: |
677 | f->rules_alloc = rules_alloc; |
678 | f->mt = new_mt; |
679 | |
680 | kvfree(addr: old_mt); |
681 | |
682 | return 0; |
683 | } |
684 | |
685 | /** |
686 | * pipapo_resize() - Resize lookup or mapping table, or both |
687 | * @f: Field containing lookup and mapping tables |
688 | * @old_rules: Previous amount of rules in field |
689 | * @rules: New amount of rules |
690 | * |
691 | * Increase, decrease or maintain tables size depending on new amount of rules, |
692 | * and copy data over. In case the new size is smaller, throw away data for |
693 | * highest-numbered rules. |
694 | * |
695 | * Return: 0 on success, -ENOMEM on allocation failure. |
696 | */ |
697 | static int pipapo_resize(struct nft_pipapo_field *f, |
698 | unsigned int old_rules, unsigned int rules) |
699 | { |
700 | long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; |
701 | unsigned int new_bucket_size, copy; |
702 | int group, bucket, err; |
703 | |
704 | if (rules >= NFT_PIPAPO_RULE0_MAX) |
705 | return -ENOSPC; |
706 | |
707 | new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); |
708 | #ifdef NFT_PIPAPO_ALIGN |
709 | new_bucket_size = roundup(new_bucket_size, |
710 | NFT_PIPAPO_ALIGN / sizeof(*new_lt)); |
711 | #endif |
712 | |
713 | if (new_bucket_size == f->bsize) |
714 | goto mt; |
715 | |
716 | if (new_bucket_size > f->bsize) |
717 | copy = f->bsize; |
718 | else |
719 | copy = new_bucket_size; |
720 | |
721 | new_lt = kvzalloc(size: f->groups * NFT_PIPAPO_BUCKETS(f->bb) * |
722 | new_bucket_size * sizeof(*new_lt) + |
723 | NFT_PIPAPO_ALIGN_HEADROOM, |
724 | GFP_KERNEL); |
725 | if (!new_lt) |
726 | return -ENOMEM; |
727 | |
728 | new_p = NFT_PIPAPO_LT_ALIGN(new_lt); |
729 | old_p = NFT_PIPAPO_LT_ALIGN(old_lt); |
730 | |
731 | for (group = 0; group < f->groups; group++) { |
732 | for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) { |
733 | memcpy(new_p, old_p, copy * sizeof(*new_p)); |
734 | new_p += copy; |
735 | old_p += copy; |
736 | |
737 | if (new_bucket_size > f->bsize) |
738 | new_p += new_bucket_size - f->bsize; |
739 | else |
740 | old_p += f->bsize - new_bucket_size; |
741 | } |
742 | } |
743 | |
744 | mt: |
745 | err = pipapo_realloc_mt(f, old_rules, rules); |
746 | if (err) { |
747 | kvfree(addr: new_lt); |
748 | return err; |
749 | } |
750 | |
751 | if (new_lt) { |
752 | f->bsize = new_bucket_size; |
753 | f->lt = new_lt; |
754 | kvfree(addr: old_lt); |
755 | } |
756 | |
757 | return 0; |
758 | } |
759 | |
760 | /** |
761 | * pipapo_bucket_set() - Set rule bit in bucket given group and group value |
762 | * @f: Field containing lookup table |
763 | * @rule: Rule index |
764 | * @group: Group index |
765 | * @v: Value of bit group |
766 | */ |
767 | static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, |
768 | int v) |
769 | { |
770 | unsigned long *pos; |
771 | |
772 | pos = NFT_PIPAPO_LT_ALIGN(f->lt); |
773 | pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group; |
774 | pos += f->bsize * v; |
775 | |
776 | __set_bit(rule, pos); |
777 | } |
778 | |
779 | /** |
780 | * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits |
781 | * @old_groups: Number of current groups |
782 | * @bsize: Size of one bucket, in longs |
783 | * @old_lt: Pointer to the current lookup table |
784 | * @new_lt: Pointer to the new, pre-allocated lookup table |
785 | * |
786 | * Each bucket with index b in the new lookup table, belonging to group g, is |
787 | * filled with the bit intersection between: |
788 | * - bucket with index given by the upper 4 bits of b, from group g, and |
789 | * - bucket with index given by the lower 4 bits of b, from group g + 1 |
790 | * |
791 | * That is, given buckets from the new lookup table N(x, y) and the old lookup |
792 | * table O(x, y), with x bucket index, and y group index: |
793 | * |
794 | * N(b, g) := O(b / 16, g) & O(b % 16, g + 1) |
795 | * |
796 | * This ensures equivalence of the matching results on lookup. Two examples in |
797 | * pictures: |
798 | * |
799 | * bucket |
800 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255 |
801 | * 0 ^ |
802 | * 1 | ^ |
803 | * ... ( & ) | |
804 | * / \ | |
805 | * / \ .-( & )-. |
806 | * / bucket \ | | |
807 | * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 | |
808 | * 0 / \ | | |
809 | * 1 \ | | |
810 | * 2 | --' |
811 | * 3 '- |
812 | * ... |
813 | */ |
814 | static void pipapo_lt_4b_to_8b(int old_groups, int bsize, |
815 | unsigned long *old_lt, unsigned long *new_lt) |
816 | { |
817 | int g, b, i; |
818 | |
819 | for (g = 0; g < old_groups / 2; g++) { |
820 | int src_g0 = g * 2, src_g1 = g * 2 + 1; |
821 | |
822 | for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) { |
823 | int src_b0 = b / NFT_PIPAPO_BUCKETS(4); |
824 | int src_b1 = b % NFT_PIPAPO_BUCKETS(4); |
825 | int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0; |
826 | int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1; |
827 | |
828 | for (i = 0; i < bsize; i++) { |
829 | *new_lt = old_lt[src_i0 * bsize + i] & |
830 | old_lt[src_i1 * bsize + i]; |
831 | new_lt++; |
832 | } |
833 | } |
834 | } |
835 | } |
836 | |
837 | /** |
838 | * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits |
839 | * @old_groups: Number of current groups |
840 | * @bsize: Size of one bucket, in longs |
841 | * @old_lt: Pointer to the current lookup table |
842 | * @new_lt: Pointer to the new, pre-allocated lookup table |
843 | * |
844 | * Each bucket with index b in the new lookup table, belonging to group g, is |
845 | * filled with the bit union of: |
846 | * - all the buckets with index such that the upper four bits of the lower byte |
847 | * equal b, from group g, with g odd |
848 | * - all the buckets with index such that the lower four bits equal b, from |
849 | * group g, with g even |
850 | * |
851 | * That is, given buckets from the new lookup table N(x, y) and the old lookup |
852 | * table O(x, y), with x bucket index, and y group index: |
853 | * |
854 | * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4) |
855 | * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f) |
856 | * |
857 | * where U() denotes the arbitrary union operation (binary OR of n terms). This |
858 | * ensures equivalence of the matching results on lookup. |
859 | */ |
860 | static void pipapo_lt_8b_to_4b(int old_groups, int bsize, |
861 | unsigned long *old_lt, unsigned long *new_lt) |
862 | { |
863 | int g, b, bsrc, i; |
864 | |
865 | memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize * |
866 | sizeof(unsigned long)); |
867 | |
868 | for (g = 0; g < old_groups * 2; g += 2) { |
869 | int src_g = g / 2; |
870 | |
871 | for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { |
872 | for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; |
873 | bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); |
874 | bsrc++) { |
875 | if (((bsrc & 0xf0) >> 4) != b) |
876 | continue; |
877 | |
878 | for (i = 0; i < bsize; i++) |
879 | new_lt[i] |= old_lt[bsrc * bsize + i]; |
880 | } |
881 | |
882 | new_lt += bsize; |
883 | } |
884 | |
885 | for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { |
886 | for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; |
887 | bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); |
888 | bsrc++) { |
889 | if ((bsrc & 0x0f) != b) |
890 | continue; |
891 | |
892 | for (i = 0; i < bsize; i++) |
893 | new_lt[i] |= old_lt[bsrc * bsize + i]; |
894 | } |
895 | |
896 | new_lt += bsize; |
897 | } |
898 | } |
899 | } |
900 | |
901 | /** |
902 | * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed |
903 | * @f: Field containing lookup table |
904 | */ |
905 | static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) |
906 | { |
907 | unsigned int groups, bb; |
908 | unsigned long *new_lt; |
909 | size_t lt_size; |
910 | |
911 | lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * |
912 | sizeof(*f->lt); |
913 | |
914 | if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET && |
915 | lt_size > NFT_PIPAPO_LT_SIZE_HIGH) { |
916 | groups = f->groups * 2; |
917 | bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; |
918 | |
919 | lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * |
920 | sizeof(*f->lt); |
921 | } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && |
922 | lt_size < NFT_PIPAPO_LT_SIZE_LOW) { |
923 | groups = f->groups / 2; |
924 | bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; |
925 | |
926 | lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * |
927 | sizeof(*f->lt); |
928 | |
929 | /* Don't increase group width if the resulting lookup table size |
930 | * would exceed the upper size threshold for a "small" set. |
931 | */ |
932 | if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH) |
933 | return; |
934 | } else { |
935 | return; |
936 | } |
937 | |
938 | new_lt = kvzalloc(size: lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); |
939 | if (!new_lt) |
940 | return; |
941 | |
942 | NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; |
943 | if (f->bb == 4 && bb == 8) { |
944 | pipapo_lt_4b_to_8b(old_groups: f->groups, bsize: f->bsize, |
945 | NFT_PIPAPO_LT_ALIGN(f->lt), |
946 | NFT_PIPAPO_LT_ALIGN(new_lt)); |
947 | } else if (f->bb == 8 && bb == 4) { |
948 | pipapo_lt_8b_to_4b(old_groups: f->groups, bsize: f->bsize, |
949 | NFT_PIPAPO_LT_ALIGN(f->lt), |
950 | NFT_PIPAPO_LT_ALIGN(new_lt)); |
951 | } else { |
952 | BUG(); |
953 | } |
954 | |
955 | f->groups = groups; |
956 | f->bb = bb; |
957 | kvfree(addr: f->lt); |
958 | f->lt = new_lt; |
959 | } |
960 | |
961 | /** |
962 | * pipapo_insert() - Insert new rule in field given input key and mask length |
963 | * @f: Field containing lookup table |
964 | * @k: Input key for classification, without nftables padding |
965 | * @mask_bits: Length of mask; matches field length for non-ranged entry |
966 | * |
967 | * Insert a new rule reference in lookup buckets corresponding to k and |
968 | * mask_bits. |
969 | * |
970 | * Return: 1 on success (one rule inserted), negative error code on failure. |
971 | */ |
972 | static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, |
973 | int mask_bits) |
974 | { |
975 | unsigned int rule = f->rules, group, ret, bit_offset = 0; |
976 | |
977 | ret = pipapo_resize(f, old_rules: f->rules, rules: f->rules + 1); |
978 | if (ret) |
979 | return ret; |
980 | |
981 | f->rules++; |
982 | |
983 | for (group = 0; group < f->groups; group++) { |
984 | int i, v; |
985 | u8 mask; |
986 | |
987 | v = k[group / (BITS_PER_BYTE / f->bb)]; |
988 | v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0); |
989 | v >>= (BITS_PER_BYTE - bit_offset) - f->bb; |
990 | |
991 | bit_offset += f->bb; |
992 | bit_offset %= BITS_PER_BYTE; |
993 | |
994 | if (mask_bits >= (group + 1) * f->bb) { |
995 | /* Not masked */ |
996 | pipapo_bucket_set(f, rule, group, v); |
997 | } else if (mask_bits <= group * f->bb) { |
998 | /* Completely masked */ |
999 | for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) |
1000 | pipapo_bucket_set(f, rule, group, v: i); |
1001 | } else { |
1002 | /* The mask limit falls on this group */ |
1003 | mask = GENMASK(f->bb - 1, 0); |
1004 | mask >>= mask_bits - group * f->bb; |
1005 | for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) { |
1006 | if ((i & ~mask) == (v & ~mask)) |
1007 | pipapo_bucket_set(f, rule, group, v: i); |
1008 | } |
1009 | } |
1010 | } |
1011 | |
1012 | pipapo_lt_bits_adjust(f); |
1013 | |
1014 | return 1; |
1015 | } |
1016 | |
1017 | /** |
1018 | * pipapo_step_diff() - Check if setting @step bit in netmask would change it |
1019 | * @base: Mask we are expanding |
1020 | * @step: Step bit for given expansion step |
1021 | * @len: Total length of mask space (set and unset bits), bytes |
1022 | * |
1023 | * Convenience function for mask expansion. |
1024 | * |
1025 | * Return: true if step bit changes mask (i.e. isn't set), false otherwise. |
1026 | */ |
1027 | static bool pipapo_step_diff(u8 *base, int step, int len) |
1028 | { |
1029 | /* Network order, byte-addressed */ |
1030 | #ifdef __BIG_ENDIAN__ |
1031 | return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); |
1032 | #else |
1033 | return !(BIT(step % BITS_PER_BYTE) & |
1034 | base[len - 1 - step / BITS_PER_BYTE]); |
1035 | #endif |
1036 | } |
1037 | |
1038 | /** |
1039 | * pipapo_step_after_end() - Check if mask exceeds range end with given step |
1040 | * @base: Mask we are expanding |
1041 | * @end: End of range |
1042 | * @step: Step bit for given expansion step, highest bit to be set |
1043 | * @len: Total length of mask space (set and unset bits), bytes |
1044 | * |
1045 | * Convenience function for mask expansion. |
1046 | * |
1047 | * Return: true if mask exceeds range setting step bits, false otherwise. |
1048 | */ |
1049 | static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, |
1050 | int len) |
1051 | { |
1052 | u8 tmp[NFT_PIPAPO_MAX_BYTES]; |
1053 | int i; |
1054 | |
1055 | memcpy(tmp, base, len); |
1056 | |
1057 | /* Network order, byte-addressed */ |
1058 | for (i = 0; i <= step; i++) |
1059 | #ifdef __BIG_ENDIAN__ |
1060 | tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); |
1061 | #else |
1062 | tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); |
1063 | #endif |
1064 | |
1065 | return memcmp(p: tmp, q: end, size: len) > 0; |
1066 | } |
1067 | |
1068 | /** |
1069 | * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry |
1070 | * @base: Netmask base |
1071 | * @step: Step bit to sum |
1072 | * @len: Netmask length, bytes |
1073 | */ |
1074 | static void pipapo_base_sum(u8 *base, int step, int len) |
1075 | { |
1076 | bool carry = false; |
1077 | int i; |
1078 | |
1079 | /* Network order, byte-addressed */ |
1080 | #ifdef __BIG_ENDIAN__ |
1081 | for (i = step / BITS_PER_BYTE; i < len; i++) { |
1082 | #else |
1083 | for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { |
1084 | #endif |
1085 | if (carry) |
1086 | base[i]++; |
1087 | else |
1088 | base[i] += 1 << (step % BITS_PER_BYTE); |
1089 | |
1090 | if (base[i]) |
1091 | break; |
1092 | |
1093 | carry = true; |
1094 | } |
1095 | } |
1096 | |
1097 | /** |
1098 | * pipapo_expand() - Expand to composing netmasks, insert into lookup table |
1099 | * @f: Field containing lookup table |
1100 | * @start: Start of range |
1101 | * @end: End of range |
1102 | * @len: Length of value in bits |
1103 | * |
1104 | * Expand range to composing netmasks and insert corresponding rule references |
1105 | * in lookup buckets. |
1106 | * |
1107 | * Return: number of inserted rules on success, negative error code on failure. |
1108 | */ |
1109 | static int pipapo_expand(struct nft_pipapo_field *f, |
1110 | const u8 *start, const u8 *end, int len) |
1111 | { |
1112 | int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); |
1113 | u8 base[NFT_PIPAPO_MAX_BYTES]; |
1114 | |
1115 | memcpy(base, start, bytes); |
1116 | while (memcmp(p: base, q: end, size: bytes) <= 0) { |
1117 | int err; |
1118 | |
1119 | step = 0; |
1120 | while (pipapo_step_diff(base, step, len: bytes)) { |
1121 | if (pipapo_step_after_end(base, end, step, len: bytes)) |
1122 | break; |
1123 | |
1124 | step++; |
1125 | if (step >= len) { |
1126 | if (!masks) { |
1127 | err = pipapo_insert(f, k: base, mask_bits: 0); |
1128 | if (err < 0) |
1129 | return err; |
1130 | masks = 1; |
1131 | } |
1132 | goto out; |
1133 | } |
1134 | } |
1135 | |
1136 | err = pipapo_insert(f, k: base, mask_bits: len - step); |
1137 | |
1138 | if (err < 0) |
1139 | return err; |
1140 | |
1141 | masks++; |
1142 | pipapo_base_sum(base, step, len: bytes); |
1143 | } |
1144 | out: |
1145 | return masks; |
1146 | } |
1147 | |
1148 | /** |
1149 | * pipapo_map() - Insert rules in mapping tables, mapping them between fields |
1150 | * @m: Matching data, including mapping table |
1151 | * @map: Table of rule maps: array of first rule and amount of rules |
1152 | * in next field a given rule maps to, for each field |
1153 | * @e: For last field, nft_set_ext pointer matching rules map to |
1154 | */ |
1155 | static void pipapo_map(struct nft_pipapo_match *m, |
1156 | union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], |
1157 | struct nft_pipapo_elem *e) |
1158 | { |
1159 | struct nft_pipapo_field *f; |
1160 | int i, j; |
1161 | |
1162 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { |
1163 | for (j = 0; j < map[i].n; j++) { |
1164 | f->mt[map[i].to + j].to = map[i + 1].to; |
1165 | f->mt[map[i].to + j].n = map[i + 1].n; |
1166 | } |
1167 | } |
1168 | |
1169 | /* Last field: map to ext instead of mapping to next field */ |
1170 | for (j = 0; j < map[i].n; j++) |
1171 | f->mt[map[i].to + j].e = e; |
1172 | } |
1173 | |
1174 | /** |
1175 | * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address |
1176 | * @m: Matching data |
1177 | * @cpu: CPU number |
1178 | */ |
1179 | static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) |
1180 | { |
1181 | struct nft_pipapo_scratch *s; |
1182 | void *mem; |
1183 | |
1184 | s = *per_cpu_ptr(m->scratch, cpu); |
1185 | if (!s) |
1186 | return; |
1187 | |
1188 | mem = s; |
1189 | mem -= s->align_off; |
1190 | kfree(objp: mem); |
1191 | } |
1192 | |
1193 | /** |
1194 | * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results |
1195 | * @clone: Copy of matching data with pending insertions and deletions |
1196 | * @bsize_max: Maximum bucket size, scratch maps cover two buckets |
1197 | * |
1198 | * Return: 0 on success, -ENOMEM on failure. |
1199 | */ |
1200 | static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, |
1201 | unsigned long bsize_max) |
1202 | { |
1203 | int i; |
1204 | |
1205 | for_each_possible_cpu(i) { |
1206 | struct nft_pipapo_scratch *scratch; |
1207 | #ifdef NFT_PIPAPO_ALIGN |
1208 | void *scratch_aligned; |
1209 | u32 align_off; |
1210 | #endif |
1211 | scratch = kzalloc_node(struct_size(scratch, map, |
1212 | bsize_max * 2) + |
1213 | NFT_PIPAPO_ALIGN_HEADROOM, |
1214 | GFP_KERNEL, cpu_to_node(cpu: i)); |
1215 | if (!scratch) { |
1216 | /* On failure, there's no need to undo previous |
1217 | * allocations: this means that some scratch maps have |
1218 | * a bigger allocated size now (this is only called on |
1219 | * insertion), but the extra space won't be used by any |
1220 | * CPU as new elements are not inserted and m->bsize_max |
1221 | * is not updated. |
1222 | */ |
1223 | return -ENOMEM; |
1224 | } |
1225 | |
1226 | pipapo_free_scratch(m: clone, cpu: i); |
1227 | |
1228 | #ifdef NFT_PIPAPO_ALIGN |
1229 | /* Align &scratch->map (not the struct itself): the extra |
1230 | * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() |
1231 | * above guarantee we can waste up to those bytes in order |
1232 | * to align the map field regardless of its offset within |
1233 | * the struct. |
1234 | */ |
1235 | BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); |
1236 | |
1237 | scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); |
1238 | scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); |
1239 | align_off = scratch_aligned - (void *)scratch; |
1240 | |
1241 | scratch = scratch_aligned; |
1242 | scratch->align_off = align_off; |
1243 | #endif |
1244 | *per_cpu_ptr(clone->scratch, i) = scratch; |
1245 | } |
1246 | |
1247 | return 0; |
1248 | } |
1249 | |
1250 | /** |
1251 | * nft_pipapo_insert() - Validate and insert ranged elements |
1252 | * @net: Network namespace |
1253 | * @set: nftables API set representation |
1254 | * @elem: nftables API element representation containing key data |
1255 | * @elem_priv: Filled with pointer to &struct nft_set_ext in inserted element |
1256 | * |
1257 | * Return: 0 on success, error pointer on failure. |
1258 | */ |
1259 | static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, |
1260 | const struct nft_set_elem *elem, |
1261 | struct nft_elem_priv **elem_priv) |
1262 | { |
1263 | const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv: elem->priv); |
1264 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; |
1265 | const u8 *start = (const u8 *)elem->key.val.data, *end; |
1266 | struct nft_pipapo *priv = nft_set_priv(set); |
1267 | struct nft_pipapo_match *m = priv->clone; |
1268 | u8 genmask = nft_genmask_next(net); |
1269 | struct nft_pipapo_elem *e, *dup; |
1270 | u64 tstamp = nft_net_tstamp(net); |
1271 | struct nft_pipapo_field *f; |
1272 | const u8 *start_p, *end_p; |
1273 | int i, bsize_max, err = 0; |
1274 | |
1275 | if (nft_set_ext_exists(ext, id: NFT_SET_EXT_KEY_END)) |
1276 | end = (const u8 *)nft_set_ext_key_end(ext)->data; |
1277 | else |
1278 | end = start; |
1279 | |
1280 | dup = pipapo_get(net, set, data: start, genmask, tstamp, GFP_KERNEL); |
1281 | if (!IS_ERR(ptr: dup)) { |
1282 | /* Check if we already have the same exact entry */ |
1283 | const struct nft_data *dup_key, *dup_end; |
1284 | |
1285 | dup_key = nft_set_ext_key(ext: &dup->ext); |
1286 | if (nft_set_ext_exists(ext: &dup->ext, id: NFT_SET_EXT_KEY_END)) |
1287 | dup_end = nft_set_ext_key_end(ext: &dup->ext); |
1288 | else |
1289 | dup_end = dup_key; |
1290 | |
1291 | if (!memcmp(p: start, q: dup_key->data, size: sizeof(*dup_key->data)) && |
1292 | !memcmp(p: end, q: dup_end->data, size: sizeof(*dup_end->data))) { |
1293 | *elem_priv = &dup->priv; |
1294 | return -EEXIST; |
1295 | } |
1296 | |
1297 | return -ENOTEMPTY; |
1298 | } |
1299 | |
1300 | if (PTR_ERR(ptr: dup) == -ENOENT) { |
1301 | /* Look for partially overlapping entries */ |
1302 | dup = pipapo_get(net, set, data: end, genmask: nft_genmask_next(net), tstamp, |
1303 | GFP_KERNEL); |
1304 | } |
1305 | |
1306 | if (PTR_ERR(ptr: dup) != -ENOENT) { |
1307 | if (IS_ERR(ptr: dup)) |
1308 | return PTR_ERR(ptr: dup); |
1309 | *elem_priv = &dup->priv; |
1310 | return -ENOTEMPTY; |
1311 | } |
1312 | |
1313 | /* Validate */ |
1314 | start_p = start; |
1315 | end_p = end; |
1316 | |
1317 | /* some helpers return -1, or 0 >= for valid rule pos, |
1318 | * so we cannot support more than INT_MAX rules at this time. |
1319 | */ |
1320 | BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX); |
1321 | |
1322 | nft_pipapo_for_each_field(f, i, m) { |
1323 | if (f->rules >= NFT_PIPAPO_RULE0_MAX) |
1324 | return -ENOSPC; |
1325 | |
1326 | if (memcmp(p: start_p, q: end_p, |
1327 | size: f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0) |
1328 | return -EINVAL; |
1329 | |
1330 | start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
1331 | end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
1332 | } |
1333 | |
1334 | /* Insert */ |
1335 | priv->dirty = true; |
1336 | |
1337 | bsize_max = m->bsize_max; |
1338 | |
1339 | nft_pipapo_for_each_field(f, i, m) { |
1340 | int ret; |
1341 | |
1342 | rulemap[i].to = f->rules; |
1343 | |
1344 | ret = memcmp(p: start, q: end, |
1345 | size: f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); |
1346 | if (!ret) |
1347 | ret = pipapo_insert(f, k: start, mask_bits: f->groups * f->bb); |
1348 | else |
1349 | ret = pipapo_expand(f, start, end, len: f->groups * f->bb); |
1350 | |
1351 | if (ret < 0) |
1352 | return ret; |
1353 | |
1354 | if (f->bsize > bsize_max) |
1355 | bsize_max = f->bsize; |
1356 | |
1357 | rulemap[i].n = ret; |
1358 | |
1359 | start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
1360 | end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
1361 | } |
1362 | |
1363 | if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { |
1364 | put_cpu_ptr(m->scratch); |
1365 | |
1366 | err = pipapo_realloc_scratch(clone: m, bsize_max); |
1367 | if (err) |
1368 | return err; |
1369 | |
1370 | m->bsize_max = bsize_max; |
1371 | } else { |
1372 | put_cpu_ptr(m->scratch); |
1373 | } |
1374 | |
1375 | e = nft_elem_priv_cast(priv: elem->priv); |
1376 | *elem_priv = &e->priv; |
1377 | |
1378 | pipapo_map(m, map: rulemap, e); |
1379 | |
1380 | return 0; |
1381 | } |
1382 | |
1383 | /** |
1384 | * pipapo_clone() - Clone matching data to create new working copy |
1385 | * @old: Existing matching data |
1386 | * |
1387 | * Return: copy of matching data passed as 'old', error pointer on failure |
1388 | */ |
1389 | static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) |
1390 | { |
1391 | struct nft_pipapo_field *dst, *src; |
1392 | struct nft_pipapo_match *new; |
1393 | int i; |
1394 | |
1395 | new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); |
1396 | if (!new) |
1397 | return ERR_PTR(error: -ENOMEM); |
1398 | |
1399 | new->field_count = old->field_count; |
1400 | new->bsize_max = old->bsize_max; |
1401 | |
1402 | new->scratch = alloc_percpu(*new->scratch); |
1403 | if (!new->scratch) |
1404 | goto out_scratch; |
1405 | |
1406 | for_each_possible_cpu(i) |
1407 | *per_cpu_ptr(new->scratch, i) = NULL; |
1408 | |
1409 | if (pipapo_realloc_scratch(clone: new, bsize_max: old->bsize_max)) |
1410 | goto out_scratch_realloc; |
1411 | |
1412 | rcu_head_init(rhp: &new->rcu); |
1413 | |
1414 | src = old->f; |
1415 | dst = new->f; |
1416 | |
1417 | for (i = 0; i < old->field_count; i++) { |
1418 | unsigned long *new_lt; |
1419 | |
1420 | memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); |
1421 | |
1422 | new_lt = kvzalloc(size: src->groups * NFT_PIPAPO_BUCKETS(src->bb) * |
1423 | src->bsize * sizeof(*dst->lt) + |
1424 | NFT_PIPAPO_ALIGN_HEADROOM, |
1425 | GFP_KERNEL); |
1426 | if (!new_lt) |
1427 | goto out_lt; |
1428 | |
1429 | dst->lt = new_lt; |
1430 | |
1431 | memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), |
1432 | NFT_PIPAPO_LT_ALIGN(src->lt), |
1433 | src->bsize * sizeof(*dst->lt) * |
1434 | src->groups * NFT_PIPAPO_BUCKETS(src->bb)); |
1435 | |
1436 | if (src->rules > 0) { |
1437 | dst->mt = kvmalloc_array(n: src->rules_alloc, |
1438 | size: sizeof(*src->mt), GFP_KERNEL); |
1439 | if (!dst->mt) |
1440 | goto out_mt; |
1441 | |
1442 | memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); |
1443 | } else { |
1444 | dst->mt = NULL; |
1445 | dst->rules_alloc = 0; |
1446 | } |
1447 | |
1448 | src++; |
1449 | dst++; |
1450 | } |
1451 | |
1452 | return new; |
1453 | |
1454 | out_mt: |
1455 | kvfree(addr: dst->lt); |
1456 | out_lt: |
1457 | for (dst--; i > 0; i--) { |
1458 | kvfree(addr: dst->mt); |
1459 | kvfree(addr: dst->lt); |
1460 | dst--; |
1461 | } |
1462 | out_scratch_realloc: |
1463 | for_each_possible_cpu(i) |
1464 | pipapo_free_scratch(m: new, cpu: i); |
1465 | out_scratch: |
1466 | free_percpu(pdata: new->scratch); |
1467 | kfree(objp: new); |
1468 | |
1469 | return ERR_PTR(error: -ENOMEM); |
1470 | } |
1471 | |
1472 | /** |
1473 | * pipapo_rules_same_key() - Get number of rules originated from the same entry |
1474 | * @f: Field containing mapping table |
1475 | * @first: Index of first rule in set of rules mapping to same entry |
1476 | * |
1477 | * Using the fact that all rules in a field that originated from the same entry |
1478 | * will map to the same set of rules in the next field, or to the same element |
1479 | * reference, return the cardinality of the set of rules that originated from |
1480 | * the same entry as the rule with index @first, @first rule included. |
1481 | * |
1482 | * In pictures: |
1483 | * rules |
1484 | * field #0 0 1 2 3 4 |
1485 | * map to: 0 1 2-4 2-4 5-9 |
1486 | * . . ....... . ... |
1487 | * | | | | \ \ |
1488 | * | | | | \ \ |
1489 | * | | | | \ \ |
1490 | * ' ' ' ' ' \ |
1491 | * in field #1 0 1 2 3 4 5 ... |
1492 | * |
1493 | * if this is called for rule 2 on field #0, it will return 3, as also rules 2 |
1494 | * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. |
1495 | * |
1496 | * For the last field in a set, we can rely on associated entries to map to the |
1497 | * same element references. |
1498 | * |
1499 | * Return: Number of rules that originated from the same entry as @first. |
1500 | */ |
1501 | static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first) |
1502 | { |
1503 | struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ |
1504 | unsigned int r; |
1505 | |
1506 | for (r = first; r < f->rules; r++) { |
1507 | if (r != first && e != f->mt[r].e) |
1508 | return r - first; |
1509 | |
1510 | e = f->mt[r].e; |
1511 | } |
1512 | |
1513 | if (r != first) |
1514 | return r - first; |
1515 | |
1516 | return 0; |
1517 | } |
1518 | |
1519 | /** |
1520 | * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones |
1521 | * @mt: Mapping array |
1522 | * @rules: Original amount of rules in mapping table |
1523 | * @start: First rule index to be removed |
1524 | * @n: Amount of rules to be removed |
1525 | * @to_offset: First rule index, in next field, this group of rules maps to |
1526 | * @is_last: If this is the last field, delete reference from mapping array |
1527 | * |
1528 | * This is used to unmap rules from the mapping table for a single field, |
1529 | * maintaining consistency and compactness for the existing ones. |
1530 | * |
1531 | * In pictures: let's assume that we want to delete rules 2 and 3 from the |
1532 | * following mapping array: |
1533 | * |
1534 | * rules |
1535 | * 0 1 2 3 4 |
1536 | * map to: 4-10 4-10 11-15 11-15 16-18 |
1537 | * |
1538 | * the result will be: |
1539 | * |
1540 | * rules |
1541 | * 0 1 2 |
1542 | * map to: 4-10 4-10 11-13 |
1543 | * |
1544 | * for fields before the last one. In case this is the mapping table for the |
1545 | * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: |
1546 | * |
1547 | * rules |
1548 | * 0 1 2 3 4 |
1549 | * element pointers: 0x42 0x42 0x33 0x33 0x44 |
1550 | * |
1551 | * the result will be: |
1552 | * |
1553 | * rules |
1554 | * 0 1 2 |
1555 | * element pointers: 0x42 0x42 0x44 |
1556 | */ |
1557 | static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules, |
1558 | unsigned int start, unsigned int n, |
1559 | unsigned int to_offset, bool is_last) |
1560 | { |
1561 | int i; |
1562 | |
1563 | memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); |
1564 | memset(mt + rules - n, 0, n * sizeof(*mt)); |
1565 | |
1566 | if (is_last) |
1567 | return; |
1568 | |
1569 | for (i = start; i < rules - n; i++) |
1570 | mt[i].to -= to_offset; |
1571 | } |
1572 | |
1573 | /** |
1574 | * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map |
1575 | * @m: Matching data |
1576 | * @rulemap: Table of rule maps, arrays of first rule and amount of rules |
1577 | * in next field a given entry maps to, for each field |
1578 | * |
1579 | * For each rule in lookup table buckets mapping to this set of rules, drop |
1580 | * all bits set in lookup table mapping. In pictures, assuming we want to drop |
1581 | * rules 0 and 1 from this lookup table: |
1582 | * |
1583 | * bucket |
1584 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
1585 | * 0 0 1,2 |
1586 | * 1 1,2 0 |
1587 | * 2 0 1,2 |
1588 | * 3 0 1,2 |
1589 | * 4 0,1,2 |
1590 | * 5 0 1 2 |
1591 | * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
1592 | * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 |
1593 | * |
1594 | * rule 2 becomes rule 0, and the result will be: |
1595 | * |
1596 | * bucket |
1597 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
1598 | * 0 0 |
1599 | * 1 0 |
1600 | * 2 0 |
1601 | * 3 0 |
1602 | * 4 0 |
1603 | * 5 0 |
1604 | * 6 0 |
1605 | * 7 0 0 |
1606 | * |
1607 | * once this is done, call unmap() to drop all the corresponding rule references |
1608 | * from mapping tables. |
1609 | */ |
1610 | static void pipapo_drop(struct nft_pipapo_match *m, |
1611 | union nft_pipapo_map_bucket rulemap[]) |
1612 | { |
1613 | struct nft_pipapo_field *f; |
1614 | int i; |
1615 | |
1616 | nft_pipapo_for_each_field(f, i, m) { |
1617 | int g; |
1618 | |
1619 | for (g = 0; g < f->groups; g++) { |
1620 | unsigned long *pos; |
1621 | int b; |
1622 | |
1623 | pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g * |
1624 | NFT_PIPAPO_BUCKETS(f->bb) * f->bsize; |
1625 | |
1626 | for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { |
1627 | bitmap_cut(dst: pos, src: pos, first: rulemap[i].to, |
1628 | cut: rulemap[i].n, |
1629 | nbits: f->bsize * BITS_PER_LONG); |
1630 | |
1631 | pos += f->bsize; |
1632 | } |
1633 | } |
1634 | |
1635 | pipapo_unmap(mt: f->mt, rules: f->rules, start: rulemap[i].to, n: rulemap[i].n, |
1636 | to_offset: rulemap[i + 1].n, is_last: i == m->field_count - 1); |
1637 | if (pipapo_resize(f, old_rules: f->rules, rules: f->rules - rulemap[i].n)) { |
1638 | /* We can ignore this, a failure to shrink tables down |
1639 | * doesn't make tables invalid. |
1640 | */ |
1641 | ; |
1642 | } |
1643 | f->rules -= rulemap[i].n; |
1644 | |
1645 | pipapo_lt_bits_adjust(f); |
1646 | } |
1647 | } |
1648 | |
1649 | static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, |
1650 | struct nft_pipapo_elem *e) |
1651 | |
1652 | { |
1653 | nft_setelem_data_deactivate(net, set, elem_priv: &e->priv); |
1654 | } |
1655 | |
1656 | /** |
1657 | * pipapo_gc() - Drop expired entries from set, destroy start and end elements |
1658 | * @set: nftables API set representation |
1659 | * @m: Matching data |
1660 | */ |
1661 | static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) |
1662 | { |
1663 | struct nft_pipapo *priv = nft_set_priv(set); |
1664 | struct net *net = read_pnet(pnet: &set->net); |
1665 | unsigned int rules_f0, first_rule = 0; |
1666 | u64 tstamp = nft_net_tstamp(net); |
1667 | struct nft_pipapo_elem *e; |
1668 | struct nft_trans_gc *gc; |
1669 | |
1670 | gc = nft_trans_gc_alloc(set, gc_seq: 0, GFP_KERNEL); |
1671 | if (!gc) |
1672 | return; |
1673 | |
1674 | while ((rules_f0 = pipapo_rules_same_key(f: m->f, first: first_rule))) { |
1675 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; |
1676 | const struct nft_pipapo_field *f; |
1677 | unsigned int i, start, rules_fx; |
1678 | |
1679 | start = first_rule; |
1680 | rules_fx = rules_f0; |
1681 | |
1682 | nft_pipapo_for_each_field(f, i, m) { |
1683 | rulemap[i].to = start; |
1684 | rulemap[i].n = rules_fx; |
1685 | |
1686 | if (i < m->field_count - 1) { |
1687 | rules_fx = f->mt[start].n; |
1688 | start = f->mt[start].to; |
1689 | } |
1690 | } |
1691 | |
1692 | /* Pick the last field, and its last index */ |
1693 | f--; |
1694 | i--; |
1695 | e = f->mt[rulemap[i].to].e; |
1696 | |
1697 | /* synchronous gc never fails, there is no need to set on |
1698 | * NFT_SET_ELEM_DEAD_BIT. |
1699 | */ |
1700 | if (__nft_set_elem_expired(ext: &e->ext, tstamp)) { |
1701 | priv->dirty = true; |
1702 | |
1703 | gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); |
1704 | if (!gc) |
1705 | return; |
1706 | |
1707 | nft_pipapo_gc_deactivate(net, set, e); |
1708 | pipapo_drop(m, rulemap); |
1709 | nft_trans_gc_elem_add(gc, priv: e); |
1710 | |
1711 | /* And check again current first rule, which is now the |
1712 | * first we haven't checked. |
1713 | */ |
1714 | } else { |
1715 | first_rule += rules_f0; |
1716 | } |
1717 | } |
1718 | |
1719 | gc = nft_trans_gc_catchall_sync(gc); |
1720 | if (gc) { |
1721 | nft_trans_gc_queue_sync_done(trans: gc); |
1722 | priv->last_gc = jiffies; |
1723 | } |
1724 | } |
1725 | |
1726 | /** |
1727 | * pipapo_free_fields() - Free per-field tables contained in matching data |
1728 | * @m: Matching data |
1729 | */ |
1730 | static void pipapo_free_fields(struct nft_pipapo_match *m) |
1731 | { |
1732 | struct nft_pipapo_field *f; |
1733 | int i; |
1734 | |
1735 | nft_pipapo_for_each_field(f, i, m) { |
1736 | kvfree(addr: f->lt); |
1737 | kvfree(addr: f->mt); |
1738 | } |
1739 | } |
1740 | |
1741 | static void pipapo_free_match(struct nft_pipapo_match *m) |
1742 | { |
1743 | int i; |
1744 | |
1745 | for_each_possible_cpu(i) |
1746 | pipapo_free_scratch(m, cpu: i); |
1747 | |
1748 | free_percpu(pdata: m->scratch); |
1749 | pipapo_free_fields(m); |
1750 | |
1751 | kfree(objp: m); |
1752 | } |
1753 | |
1754 | /** |
1755 | * pipapo_reclaim_match - RCU callback to free fields from old matching data |
1756 | * @rcu: RCU head |
1757 | */ |
1758 | static void pipapo_reclaim_match(struct rcu_head *rcu) |
1759 | { |
1760 | struct nft_pipapo_match *m; |
1761 | |
1762 | m = container_of(rcu, struct nft_pipapo_match, rcu); |
1763 | pipapo_free_match(m); |
1764 | } |
1765 | |
1766 | /** |
1767 | * nft_pipapo_commit() - Replace lookup data with current working copy |
1768 | * @set: nftables API set representation |
1769 | * |
1770 | * While at it, check if we should perform garbage collection on the working |
1771 | * copy before committing it for lookup, and don't replace the table if the |
1772 | * working copy doesn't have pending changes. |
1773 | * |
1774 | * We also need to create a new working copy for subsequent insertions and |
1775 | * deletions. |
1776 | */ |
1777 | static void nft_pipapo_commit(struct nft_set *set) |
1778 | { |
1779 | struct nft_pipapo *priv = nft_set_priv(set); |
1780 | struct nft_pipapo_match *new_clone, *old; |
1781 | |
1782 | if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) |
1783 | pipapo_gc(set, m: priv->clone); |
1784 | |
1785 | if (!priv->dirty) |
1786 | return; |
1787 | |
1788 | new_clone = pipapo_clone(old: priv->clone); |
1789 | if (IS_ERR(ptr: new_clone)) |
1790 | return; |
1791 | |
1792 | priv->dirty = false; |
1793 | |
1794 | old = rcu_access_pointer(priv->match); |
1795 | rcu_assign_pointer(priv->match, priv->clone); |
1796 | if (old) |
1797 | call_rcu(head: &old->rcu, func: pipapo_reclaim_match); |
1798 | |
1799 | priv->clone = new_clone; |
1800 | } |
1801 | |
1802 | static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) |
1803 | { |
1804 | #ifdef CONFIG_PROVE_LOCKING |
1805 | const struct net *net = read_pnet(pnet: &set->net); |
1806 | |
1807 | return lockdep_is_held(&nft_pernet(net)->commit_mutex); |
1808 | #else |
1809 | return true; |
1810 | #endif |
1811 | } |
1812 | |
1813 | static void nft_pipapo_abort(const struct nft_set *set) |
1814 | { |
1815 | struct nft_pipapo *priv = nft_set_priv(set); |
1816 | struct nft_pipapo_match *new_clone, *m; |
1817 | |
1818 | if (!priv->dirty) |
1819 | return; |
1820 | |
1821 | m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); |
1822 | |
1823 | new_clone = pipapo_clone(old: m); |
1824 | if (IS_ERR(ptr: new_clone)) |
1825 | return; |
1826 | |
1827 | priv->dirty = false; |
1828 | |
1829 | pipapo_free_match(m: priv->clone); |
1830 | priv->clone = new_clone; |
1831 | } |
1832 | |
1833 | /** |
1834 | * nft_pipapo_activate() - Mark element reference as active given key, commit |
1835 | * @net: Network namespace |
1836 | * @set: nftables API set representation |
1837 | * @elem_priv: nftables API element representation containing key data |
1838 | * |
1839 | * On insertion, elements are added to a copy of the matching data currently |
1840 | * in use for lookups, and not directly inserted into current lookup data. Both |
1841 | * nft_pipapo_insert() and nft_pipapo_activate() are called once for each |
1842 | * element, hence we can't purpose either one as a real commit operation. |
1843 | */ |
1844 | static void nft_pipapo_activate(const struct net *net, |
1845 | const struct nft_set *set, |
1846 | struct nft_elem_priv *elem_priv) |
1847 | { |
1848 | struct nft_pipapo_elem *e = nft_elem_priv_cast(priv: elem_priv); |
1849 | |
1850 | nft_clear(net, &e->ext); |
1851 | } |
1852 | |
1853 | /** |
1854 | * pipapo_deactivate() - Check that element is in set, mark as inactive |
1855 | * @net: Network namespace |
1856 | * @set: nftables API set representation |
1857 | * @data: Input key data |
1858 | * @ext: nftables API extension pointer, used to check for end element |
1859 | * |
1860 | * This is a convenience function that can be called from both |
1861 | * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same |
1862 | * operation. |
1863 | * |
1864 | * Return: deactivated element if found, NULL otherwise. |
1865 | */ |
1866 | static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, |
1867 | const u8 *data, const struct nft_set_ext *ext) |
1868 | { |
1869 | struct nft_pipapo_elem *e; |
1870 | |
1871 | e = pipapo_get(net, set, data, genmask: nft_genmask_next(net), |
1872 | tstamp: nft_net_tstamp(net), GFP_KERNEL); |
1873 | if (IS_ERR(ptr: e)) |
1874 | return NULL; |
1875 | |
1876 | nft_set_elem_change_active(net, set, ext: &e->ext); |
1877 | |
1878 | return e; |
1879 | } |
1880 | |
1881 | /** |
1882 | * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive |
1883 | * @net: Network namespace |
1884 | * @set: nftables API set representation |
1885 | * @elem: nftables API element representation containing key data |
1886 | * |
1887 | * Return: deactivated element if found, NULL otherwise. |
1888 | */ |
1889 | static struct nft_elem_priv * |
1890 | nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, |
1891 | const struct nft_set_elem *elem) |
1892 | { |
1893 | const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv: elem->priv); |
1894 | |
1895 | return pipapo_deactivate(net, set, data: (const u8 *)elem->key.val.data, ext); |
1896 | } |
1897 | |
1898 | /** |
1899 | * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive |
1900 | * @net: Network namespace |
1901 | * @set: nftables API set representation |
1902 | * @elem_priv: nftables API element representation containing key data |
1903 | * |
1904 | * This is functionally the same as nft_pipapo_deactivate(), with a slightly |
1905 | * different interface, and it's also called once for each element in a set |
1906 | * being flushed, so we can't implement, strictly speaking, a flush operation, |
1907 | * which would otherwise be as simple as allocating an empty copy of the |
1908 | * matching data. |
1909 | * |
1910 | * Note that we could in theory do that, mark the set as flushed, and ignore |
1911 | * subsequent calls, but we would leak all the elements after the first one, |
1912 | * because they wouldn't then be freed as result of API calls. |
1913 | * |
1914 | * Return: true if element was found and deactivated. |
1915 | */ |
1916 | static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, |
1917 | struct nft_elem_priv *elem_priv) |
1918 | { |
1919 | struct nft_pipapo_elem *e = nft_elem_priv_cast(priv: elem_priv); |
1920 | |
1921 | nft_set_elem_change_active(net, set, ext: &e->ext); |
1922 | } |
1923 | |
1924 | /** |
1925 | * pipapo_get_boundaries() - Get byte interval for associated rules |
1926 | * @f: Field including lookup table |
1927 | * @first_rule: First rule (lowest index) |
1928 | * @rule_count: Number of associated rules |
1929 | * @left: Byte expression for left boundary (start of range) |
1930 | * @right: Byte expression for right boundary (end of range) |
1931 | * |
1932 | * Given the first rule and amount of rules that originated from the same entry, |
1933 | * build the original range associated with the entry, and calculate the length |
1934 | * of the originating netmask. |
1935 | * |
1936 | * In pictures: |
1937 | * |
1938 | * bucket |
1939 | * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
1940 | * 0 1,2 |
1941 | * 1 1,2 |
1942 | * 2 1,2 |
1943 | * 3 1,2 |
1944 | * 4 1,2 |
1945 | * 5 1 2 |
1946 | * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
1947 | * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
1948 | * |
1949 | * this is the lookup table corresponding to the IPv4 range |
1950 | * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, |
1951 | * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. |
1952 | * |
1953 | * This function fills @left and @right with the byte values of the leftmost |
1954 | * and rightmost bucket indices for the lowest and highest rule indices, |
1955 | * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in |
1956 | * nibbles: |
1957 | * left: < 12, 0, 10, 8, 0, 1, 0, 0 > |
1958 | * right: < 12, 0, 10, 8, 0, 2, 2, 1 > |
1959 | * corresponding to bytes: |
1960 | * left: < 192, 168, 1, 0 > |
1961 | * right: < 192, 168, 2, 1 > |
1962 | * with mask length irrelevant here, unused on return, as the range is already |
1963 | * defined by its start and end points. The mask length is relevant for a single |
1964 | * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore |
1965 | * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes |
1966 | * < 192, 168, 1, 255 >, and the mask length, calculated from the distances |
1967 | * between leftmost and rightmost bucket indices for each group, would be 24. |
1968 | * |
1969 | * Return: mask length, in bits. |
1970 | */ |
1971 | static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, |
1972 | int rule_count, u8 *left, u8 *right) |
1973 | { |
1974 | int g, mask_len = 0, bit_offset = 0; |
1975 | u8 *l = left, *r = right; |
1976 | |
1977 | for (g = 0; g < f->groups; g++) { |
1978 | int b, x0, x1; |
1979 | |
1980 | x0 = -1; |
1981 | x1 = -1; |
1982 | for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { |
1983 | unsigned long *pos; |
1984 | |
1985 | pos = NFT_PIPAPO_LT_ALIGN(f->lt) + |
1986 | (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize; |
1987 | if (test_bit(first_rule, pos) && x0 == -1) |
1988 | x0 = b; |
1989 | if (test_bit(first_rule + rule_count - 1, pos)) |
1990 | x1 = b; |
1991 | } |
1992 | |
1993 | *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset); |
1994 | *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset); |
1995 | |
1996 | bit_offset += f->bb; |
1997 | if (bit_offset >= BITS_PER_BYTE) { |
1998 | bit_offset %= BITS_PER_BYTE; |
1999 | l++; |
2000 | r++; |
2001 | } |
2002 | |
2003 | if (x1 - x0 == 0) |
2004 | mask_len += 4; |
2005 | else if (x1 - x0 == 1) |
2006 | mask_len += 3; |
2007 | else if (x1 - x0 == 3) |
2008 | mask_len += 2; |
2009 | else if (x1 - x0 == 7) |
2010 | mask_len += 1; |
2011 | } |
2012 | |
2013 | return mask_len; |
2014 | } |
2015 | |
2016 | /** |
2017 | * pipapo_match_field() - Match rules against byte ranges |
2018 | * @f: Field including the lookup table |
2019 | * @first_rule: First of associated rules originating from same entry |
2020 | * @rule_count: Amount of associated rules |
2021 | * @start: Start of range to be matched |
2022 | * @end: End of range to be matched |
2023 | * |
2024 | * Return: true on match, false otherwise. |
2025 | */ |
2026 | static bool pipapo_match_field(struct nft_pipapo_field *f, |
2027 | int first_rule, int rule_count, |
2028 | const u8 *start, const u8 *end) |
2029 | { |
2030 | u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; |
2031 | u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; |
2032 | |
2033 | pipapo_get_boundaries(f, first_rule, rule_count, left, right); |
2034 | |
2035 | return !memcmp(p: start, q: left, |
2036 | size: f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) && |
2037 | !memcmp(p: end, q: right, size: f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); |
2038 | } |
2039 | |
2040 | /** |
2041 | * nft_pipapo_remove() - Remove element given key, commit |
2042 | * @net: Network namespace |
2043 | * @set: nftables API set representation |
2044 | * @elem_priv: nftables API element representation containing key data |
2045 | * |
2046 | * Similarly to nft_pipapo_activate(), this is used as commit operation by the |
2047 | * API, but it's called once per element in the pending transaction, so we can't |
2048 | * implement this as a single commit operation. Closest we can get is to remove |
2049 | * the matched element here, if any, and commit the updated matching data. |
2050 | */ |
2051 | static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, |
2052 | struct nft_elem_priv *elem_priv) |
2053 | { |
2054 | struct nft_pipapo *priv = nft_set_priv(set); |
2055 | struct nft_pipapo_match *m = priv->clone; |
2056 | unsigned int rules_f0, first_rule = 0; |
2057 | struct nft_pipapo_elem *e; |
2058 | const u8 *data; |
2059 | |
2060 | e = nft_elem_priv_cast(priv: elem_priv); |
2061 | data = (const u8 *)nft_set_ext_key(ext: &e->ext); |
2062 | |
2063 | while ((rules_f0 = pipapo_rules_same_key(f: m->f, first: first_rule))) { |
2064 | union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; |
2065 | const u8 *match_start, *match_end; |
2066 | struct nft_pipapo_field *f; |
2067 | int i, start, rules_fx; |
2068 | |
2069 | match_start = data; |
2070 | |
2071 | if (nft_set_ext_exists(ext: &e->ext, id: NFT_SET_EXT_KEY_END)) |
2072 | match_end = (const u8 *)nft_set_ext_key_end(ext: &e->ext)->data; |
2073 | else |
2074 | match_end = data; |
2075 | |
2076 | start = first_rule; |
2077 | rules_fx = rules_f0; |
2078 | |
2079 | nft_pipapo_for_each_field(f, i, m) { |
2080 | bool last = i == m->field_count - 1; |
2081 | |
2082 | if (!pipapo_match_field(f, first_rule: start, rule_count: rules_fx, |
2083 | start: match_start, end: match_end)) |
2084 | break; |
2085 | |
2086 | rulemap[i].to = start; |
2087 | rulemap[i].n = rules_fx; |
2088 | |
2089 | rules_fx = f->mt[start].n; |
2090 | start = f->mt[start].to; |
2091 | |
2092 | match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
2093 | match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); |
2094 | |
2095 | if (last && f->mt[rulemap[i].to].e == e) { |
2096 | priv->dirty = true; |
2097 | pipapo_drop(m, rulemap); |
2098 | return; |
2099 | } |
2100 | } |
2101 | |
2102 | first_rule += rules_f0; |
2103 | } |
2104 | |
2105 | WARN_ON_ONCE(1); /* elem_priv not found */ |
2106 | } |
2107 | |
2108 | /** |
2109 | * nft_pipapo_walk() - Walk over elements |
2110 | * @ctx: nftables API context |
2111 | * @set: nftables API set representation |
2112 | * @iter: Iterator |
2113 | * |
2114 | * As elements are referenced in the mapping array for the last field, directly |
2115 | * scan that array: there's no need to follow rule mappings from the first |
2116 | * field. |
2117 | */ |
2118 | static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, |
2119 | struct nft_set_iter *iter) |
2120 | { |
2121 | struct nft_pipapo *priv = nft_set_priv(set); |
2122 | const struct nft_pipapo_match *m; |
2123 | const struct nft_pipapo_field *f; |
2124 | unsigned int i, r; |
2125 | |
2126 | WARN_ON_ONCE(iter->type != NFT_ITER_READ && |
2127 | iter->type != NFT_ITER_UPDATE); |
2128 | |
2129 | rcu_read_lock(); |
2130 | if (iter->type == NFT_ITER_READ) |
2131 | m = rcu_dereference(priv->match); |
2132 | else |
2133 | m = priv->clone; |
2134 | |
2135 | if (unlikely(!m)) |
2136 | goto out; |
2137 | |
2138 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) |
2139 | ; |
2140 | |
2141 | for (r = 0; r < f->rules; r++) { |
2142 | struct nft_pipapo_elem *e; |
2143 | |
2144 | if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) |
2145 | continue; |
2146 | |
2147 | if (iter->count < iter->skip) |
2148 | goto cont; |
2149 | |
2150 | e = f->mt[r].e; |
2151 | |
2152 | iter->err = iter->fn(ctx, set, iter, &e->priv); |
2153 | if (iter->err < 0) |
2154 | goto out; |
2155 | |
2156 | cont: |
2157 | iter->count++; |
2158 | } |
2159 | |
2160 | out: |
2161 | rcu_read_unlock(); |
2162 | } |
2163 | |
2164 | /** |
2165 | * nft_pipapo_privsize() - Return the size of private data for the set |
2166 | * @nla: netlink attributes, ignored as size doesn't depend on them |
2167 | * @desc: Set description, ignored as size doesn't depend on it |
2168 | * |
2169 | * Return: size of private data for this set implementation, in bytes |
2170 | */ |
2171 | static u64 nft_pipapo_privsize(const struct nlattr * const nla[], |
2172 | const struct nft_set_desc *desc) |
2173 | { |
2174 | return sizeof(struct nft_pipapo); |
2175 | } |
2176 | |
2177 | /** |
2178 | * nft_pipapo_estimate() - Set size, space and lookup complexity |
2179 | * @desc: Set description, element count and field description used |
2180 | * @features: Flags: NFT_SET_INTERVAL needs to be there |
2181 | * @est: Storage for estimation data |
2182 | * |
2183 | * Return: true if set description is compatible, false otherwise |
2184 | */ |
2185 | static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, |
2186 | struct nft_set_estimate *est) |
2187 | { |
2188 | if (!(features & NFT_SET_INTERVAL) || |
2189 | desc->field_count < NFT_PIPAPO_MIN_FIELDS) |
2190 | return false; |
2191 | |
2192 | est->size = pipapo_estimate_size(desc); |
2193 | if (!est->size) |
2194 | return false; |
2195 | |
2196 | est->lookup = NFT_SET_CLASS_O_LOG_N; |
2197 | |
2198 | est->space = NFT_SET_CLASS_O_N; |
2199 | |
2200 | return true; |
2201 | } |
2202 | |
2203 | /** |
2204 | * nft_pipapo_init() - Initialise data for a set instance |
2205 | * @set: nftables API set representation |
2206 | * @desc: Set description |
2207 | * @nla: netlink attributes |
2208 | * |
2209 | * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink |
2210 | * attributes, initialise internal set parameters, current instance of matching |
2211 | * data and a copy for subsequent insertions. |
2212 | * |
2213 | * Return: 0 on success, negative error code on failure. |
2214 | */ |
2215 | static int nft_pipapo_init(const struct nft_set *set, |
2216 | const struct nft_set_desc *desc, |
2217 | const struct nlattr * const nla[]) |
2218 | { |
2219 | struct nft_pipapo *priv = nft_set_priv(set); |
2220 | struct nft_pipapo_match *m; |
2221 | struct nft_pipapo_field *f; |
2222 | int err, i, field_count; |
2223 | |
2224 | BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); |
2225 | |
2226 | field_count = desc->field_count ? : 1; |
2227 | |
2228 | BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255); |
2229 | BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT); |
2230 | |
2231 | if (field_count > NFT_PIPAPO_MAX_FIELDS) |
2232 | return -EINVAL; |
2233 | |
2234 | m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); |
2235 | if (!m) |
2236 | return -ENOMEM; |
2237 | |
2238 | m->field_count = field_count; |
2239 | m->bsize_max = 0; |
2240 | |
2241 | m->scratch = alloc_percpu(struct nft_pipapo_scratch *); |
2242 | if (!m->scratch) { |
2243 | err = -ENOMEM; |
2244 | goto out_scratch; |
2245 | } |
2246 | for_each_possible_cpu(i) |
2247 | *per_cpu_ptr(m->scratch, i) = NULL; |
2248 | |
2249 | rcu_head_init(rhp: &m->rcu); |
2250 | |
2251 | nft_pipapo_for_each_field(f, i, m) { |
2252 | unsigned int len = desc->field_len[i] ? : set->klen; |
2253 | |
2254 | /* f->groups is u8 */ |
2255 | BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES * |
2256 | BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256); |
2257 | |
2258 | f->bb = NFT_PIPAPO_GROUP_BITS_INIT; |
2259 | f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); |
2260 | |
2261 | priv->width += round_up(len, sizeof(u32)); |
2262 | |
2263 | f->bsize = 0; |
2264 | f->rules = 0; |
2265 | f->rules_alloc = 0; |
2266 | f->lt = NULL; |
2267 | f->mt = NULL; |
2268 | } |
2269 | |
2270 | /* Create an initial clone of matching data for next insertion */ |
2271 | priv->clone = pipapo_clone(old: m); |
2272 | if (IS_ERR(ptr: priv->clone)) { |
2273 | err = PTR_ERR(ptr: priv->clone); |
2274 | goto out_free; |
2275 | } |
2276 | |
2277 | priv->dirty = false; |
2278 | |
2279 | rcu_assign_pointer(priv->match, m); |
2280 | |
2281 | return 0; |
2282 | |
2283 | out_free: |
2284 | free_percpu(pdata: m->scratch); |
2285 | out_scratch: |
2286 | kfree(objp: m); |
2287 | |
2288 | return err; |
2289 | } |
2290 | |
2291 | /** |
2292 | * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array |
2293 | * @ctx: context |
2294 | * @set: nftables API set representation |
2295 | * @m: matching data pointing to key mapping array |
2296 | */ |
2297 | static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, |
2298 | const struct nft_set *set, |
2299 | struct nft_pipapo_match *m) |
2300 | { |
2301 | struct nft_pipapo_field *f; |
2302 | unsigned int i, r; |
2303 | |
2304 | for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) |
2305 | ; |
2306 | |
2307 | for (r = 0; r < f->rules; r++) { |
2308 | struct nft_pipapo_elem *e; |
2309 | |
2310 | if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) |
2311 | continue; |
2312 | |
2313 | e = f->mt[r].e; |
2314 | |
2315 | nf_tables_set_elem_destroy(ctx, set, elem_priv: &e->priv); |
2316 | } |
2317 | } |
2318 | |
2319 | /** |
2320 | * nft_pipapo_destroy() - Free private data for set and all committed elements |
2321 | * @ctx: context |
2322 | * @set: nftables API set representation |
2323 | */ |
2324 | static void nft_pipapo_destroy(const struct nft_ctx *ctx, |
2325 | const struct nft_set *set) |
2326 | { |
2327 | struct nft_pipapo *priv = nft_set_priv(set); |
2328 | struct nft_pipapo_match *m; |
2329 | int cpu; |
2330 | |
2331 | m = rcu_dereference_protected(priv->match, true); |
2332 | if (m) { |
2333 | rcu_barrier(); |
2334 | |
2335 | for_each_possible_cpu(cpu) |
2336 | pipapo_free_scratch(m, cpu); |
2337 | free_percpu(pdata: m->scratch); |
2338 | pipapo_free_fields(m); |
2339 | kfree(objp: m); |
2340 | priv->match = NULL; |
2341 | } |
2342 | |
2343 | if (priv->clone) { |
2344 | m = priv->clone; |
2345 | |
2346 | nft_set_pipapo_match_destroy(ctx, set, m); |
2347 | |
2348 | for_each_possible_cpu(cpu) |
2349 | pipapo_free_scratch(m: priv->clone, cpu); |
2350 | free_percpu(pdata: priv->clone->scratch); |
2351 | |
2352 | pipapo_free_fields(m: priv->clone); |
2353 | kfree(objp: priv->clone); |
2354 | priv->clone = NULL; |
2355 | } |
2356 | } |
2357 | |
2358 | /** |
2359 | * nft_pipapo_gc_init() - Initialise garbage collection |
2360 | * @set: nftables API set representation |
2361 | * |
2362 | * Instead of actually setting up a periodic work for garbage collection, as |
2363 | * this operation requires a swap of matching data with the working copy, we'll |
2364 | * do that opportunistically with other commit operations if the interval is |
2365 | * elapsed, so we just need to set the current jiffies timestamp here. |
2366 | */ |
2367 | static void nft_pipapo_gc_init(const struct nft_set *set) |
2368 | { |
2369 | struct nft_pipapo *priv = nft_set_priv(set); |
2370 | |
2371 | priv->last_gc = jiffies; |
2372 | } |
2373 | |
2374 | const struct nft_set_type nft_set_pipapo_type = { |
2375 | .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | |
2376 | NFT_SET_TIMEOUT, |
2377 | .ops = { |
2378 | .lookup = nft_pipapo_lookup, |
2379 | .insert = nft_pipapo_insert, |
2380 | .activate = nft_pipapo_activate, |
2381 | .deactivate = nft_pipapo_deactivate, |
2382 | .flush = nft_pipapo_flush, |
2383 | .remove = nft_pipapo_remove, |
2384 | .walk = nft_pipapo_walk, |
2385 | .get = nft_pipapo_get, |
2386 | .privsize = nft_pipapo_privsize, |
2387 | .estimate = nft_pipapo_estimate, |
2388 | .init = nft_pipapo_init, |
2389 | .destroy = nft_pipapo_destroy, |
2390 | .gc_init = nft_pipapo_gc_init, |
2391 | .commit = nft_pipapo_commit, |
2392 | .abort = nft_pipapo_abort, |
2393 | .elemsize = offsetof(struct nft_pipapo_elem, ext), |
2394 | }, |
2395 | }; |
2396 | |
2397 | #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) |
2398 | const struct nft_set_type nft_set_pipapo_avx2_type = { |
2399 | .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | |
2400 | NFT_SET_TIMEOUT, |
2401 | .ops = { |
2402 | .lookup = nft_pipapo_avx2_lookup, |
2403 | .insert = nft_pipapo_insert, |
2404 | .activate = nft_pipapo_activate, |
2405 | .deactivate = nft_pipapo_deactivate, |
2406 | .flush = nft_pipapo_flush, |
2407 | .remove = nft_pipapo_remove, |
2408 | .walk = nft_pipapo_walk, |
2409 | .get = nft_pipapo_get, |
2410 | .privsize = nft_pipapo_privsize, |
2411 | .estimate = nft_pipapo_avx2_estimate, |
2412 | .init = nft_pipapo_init, |
2413 | .destroy = nft_pipapo_destroy, |
2414 | .gc_init = nft_pipapo_gc_init, |
2415 | .commit = nft_pipapo_commit, |
2416 | .abort = nft_pipapo_abort, |
2417 | .elemsize = offsetof(struct nft_pipapo_elem, ext), |
2418 | }, |
2419 | }; |
2420 | #endif |
2421 | |