1 | // SPDX-License-Identifier: GPL-2.0-or-later |
/*
 * lib/ts_fsm.c	   A naive finite state machine text search approach
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 *
 * ==========================================================================
 *
 *   A finite state machine consists of n states (struct ts_fsm_token)
 *   representing the pattern as a finite automaton. The data is read
 *   sequentially on an octet basis. Every state token specifies the number
 *   of recurrences and the type of value accepted, which can be either a
 *   specific character or a ctype based set of characters. The available
 *   types of recurrence are 1, (0|1), [0 n], and [1 n].
 *
 *   The algorithm distinguishes between strict and non-strict mode, which
 *   specifies whether the pattern has to start at the first octet. Strict
 *   mode is enabled by default and can be disabled by inserting
 *   TS_FSM_HEAD_IGNORE as the first token in the chain.
 *
 *   The runtime performance of the algorithm should be around O(n),
 *   however while in strict mode the average runtime can be better.
 */
24 | |
25 | #include <linux/module.h> |
26 | #include <linux/types.h> |
27 | #include <linux/string.h> |
28 | #include <linux/ctype.h> |
29 | #include <linux/textsearch.h> |
30 | #include <linux/textsearch_fsm.h> |
31 | |
32 | struct ts_fsm |
33 | { |
34 | unsigned int ntokens; |
35 | struct ts_fsm_token tokens[]; |
36 | }; |
37 | |
/*
 * Additional classification flags used next to the ones taken from
 * ctype.h. In token_lookup_tbl, _A is set for octets 0-127 and _W is
 * set for every octet.
 */
#define _A		(1 << 8)	/* ascii */
#define _W		(1 << 9)	/* wildcard */
41 | |
42 | /* Map to _ctype flags and some magic numbers */ |
43 | static const u16 token_map[TS_FSM_TYPE_MAX+1] = { |
44 | [TS_FSM_SPECIFIC] = 0, |
45 | [TS_FSM_WILDCARD] = _W, |
46 | [TS_FSM_CNTRL] = _C, |
47 | [TS_FSM_LOWER] = _L, |
48 | [TS_FSM_UPPER] = _U, |
49 | [TS_FSM_PUNCT] = _P, |
50 | [TS_FSM_SPACE] = _S, |
51 | [TS_FSM_DIGIT] = _D, |
52 | [TS_FSM_XDIGIT] = _D | _X, |
53 | [TS_FSM_ALPHA] = _U | _L, |
54 | [TS_FSM_ALNUM] = _U | _L | _D, |
55 | [TS_FSM_PRINT] = _P | _U | _L | _D | _SP, |
56 | [TS_FSM_GRAPH] = _P | _U | _L | _D, |
57 | [TS_FSM_ASCII] = _A, |
58 | }; |
59 | |
60 | static const u16 token_lookup_tbl[256] = { |
61 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */ |
62 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */ |
63 | _W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */ |
64 | _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */ |
65 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */ |
66 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */ |
67 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */ |
68 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */ |
69 | _W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */ |
70 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */ |
71 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */ |
72 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */ |
73 | _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */ |
74 | _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */ |
75 | _W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */ |
76 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */ |
77 | _W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */ |
78 | _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */ |
79 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */ |
80 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */ |
81 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */ |
82 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */ |
83 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */ |
84 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */ |
85 | _W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */ |
86 | _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */ |
87 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */ |
88 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */ |
89 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */ |
90 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */ |
91 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */ |
92 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */ |
93 | _W, _W, _W, _W, /* 128-131 */ |
94 | _W, _W, _W, _W, /* 132-135 */ |
95 | _W, _W, _W, _W, /* 136-139 */ |
96 | _W, _W, _W, _W, /* 140-143 */ |
97 | _W, _W, _W, _W, /* 144-147 */ |
98 | _W, _W, _W, _W, /* 148-151 */ |
99 | _W, _W, _W, _W, /* 152-155 */ |
100 | _W, _W, _W, _W, /* 156-159 */ |
101 | _W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */ |
102 | _W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */ |
103 | _W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */ |
104 | _W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */ |
105 | _W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */ |
106 | _W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */ |
107 | _W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */ |
108 | _W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */ |
109 | _W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */ |
110 | _W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */ |
111 | _W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */ |
112 | _W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */ |
113 | _W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */ |
114 | _W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */ |
115 | _W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */ |
116 | _W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */ |
117 | _W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */ |
118 | _W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */ |
119 | _W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */ |
120 | _W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */ |
121 | _W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */ |
122 | _W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */ |
123 | _W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */ |
124 | _W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */ |
125 | |
126 | static inline int match_token(struct ts_fsm_token *t, u8 d) |
127 | { |
128 | if (t->type) |
129 | return (token_lookup_tbl[d] & t->type) != 0; |
130 | else |
131 | return t->value == d; |
132 | } |
133 | |
134 | static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) |
135 | { |
136 | struct ts_fsm *fsm = ts_config_priv(conf); |
137 | struct ts_fsm_token *cur = NULL, *next; |
138 | unsigned int match_start, block_idx = 0, tok_idx; |
139 | unsigned block_len = 0, strict, consumed = state->offset; |
140 | const u8 *data; |
141 | |
142 | #define GET_NEXT_BLOCK() \ |
143 | ({ consumed += block_idx; \ |
144 | block_idx = 0; \ |
145 | block_len = conf->get_next_block(consumed, &data, conf, state); }) |
146 | |
147 | #define TOKEN_MISMATCH() \ |
148 | do { \ |
149 | if (strict) \ |
150 | goto no_match; \ |
151 | block_idx++; \ |
152 | goto startover; \ |
153 | } while(0) |
154 | |
155 | #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK()) |
156 | |
157 | if (end_of_data()) |
158 | goto no_match; |
159 | |
160 | strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; |
161 | |
162 | startover: |
163 | match_start = consumed + block_idx; |
164 | |
165 | for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) { |
166 | cur = &fsm->tokens[tok_idx]; |
167 | |
168 | if (likely(tok_idx < (fsm->ntokens - 1))) |
169 | next = &fsm->tokens[tok_idx + 1]; |
170 | else |
171 | next = NULL; |
172 | |
173 | switch (cur->recur) { |
174 | case TS_FSM_SINGLE: |
175 | if (end_of_data()) |
176 | goto no_match; |
177 | |
178 | if (!match_token(t: cur, d: data[block_idx])) |
179 | TOKEN_MISMATCH(); |
180 | break; |
181 | |
182 | case TS_FSM_PERHAPS: |
183 | if (end_of_data() || |
184 | !match_token(t: cur, d: data[block_idx])) |
185 | continue; |
186 | break; |
187 | |
188 | case TS_FSM_MULTI: |
189 | if (end_of_data()) |
190 | goto no_match; |
191 | |
192 | if (!match_token(t: cur, d: data[block_idx])) |
193 | TOKEN_MISMATCH(); |
194 | |
195 | block_idx++; |
196 | fallthrough; |
197 | |
198 | case TS_FSM_ANY: |
199 | if (next == NULL) |
200 | goto found_match; |
201 | |
202 | if (end_of_data()) |
203 | continue; |
204 | |
205 | while (!match_token(t: next, d: data[block_idx])) { |
206 | if (!match_token(t: cur, d: data[block_idx])) |
207 | TOKEN_MISMATCH(); |
208 | block_idx++; |
209 | if (end_of_data()) |
210 | goto no_match; |
211 | } |
212 | continue; |
213 | |
214 | /* |
215 | * Optimization: Prefer small local loop over jumping |
216 | * back and forth until garbage at head is munched. |
217 | */ |
218 | case TS_FSM_HEAD_IGNORE: |
219 | if (end_of_data()) |
220 | continue; |
221 | |
222 | while (!match_token(t: next, d: data[block_idx])) { |
223 | /* |
224 | * Special case, don't start over upon |
225 | * a mismatch, give the user the |
226 | * chance to specify the type of data |
227 | * allowed to be ignored. |
228 | */ |
229 | if (!match_token(t: cur, d: data[block_idx])) |
230 | goto no_match; |
231 | |
232 | block_idx++; |
233 | if (end_of_data()) |
234 | goto no_match; |
235 | } |
236 | |
237 | match_start = consumed + block_idx; |
238 | continue; |
239 | } |
240 | |
241 | block_idx++; |
242 | } |
243 | |
244 | if (end_of_data()) |
245 | goto found_match; |
246 | |
247 | no_match: |
248 | return UINT_MAX; |
249 | |
250 | found_match: |
251 | state->offset = consumed + block_idx; |
252 | return match_start; |
253 | } |
254 | |
255 | static struct ts_config *fsm_init(const void *pattern, unsigned int len, |
256 | gfp_t gfp_mask, int flags) |
257 | { |
258 | int i, err = -EINVAL; |
259 | struct ts_config *conf; |
260 | struct ts_fsm *fsm; |
261 | struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; |
262 | unsigned int ntokens = len / sizeof(*tokens); |
263 | size_t priv_size = sizeof(*fsm) + len; |
264 | |
265 | if (len % sizeof(struct ts_fsm_token) || ntokens < 1) |
266 | goto errout; |
267 | |
268 | if (flags & TS_IGNORECASE) |
269 | goto errout; |
270 | |
271 | for (i = 0; i < ntokens; i++) { |
272 | struct ts_fsm_token *t = &tokens[i]; |
273 | |
274 | if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) |
275 | goto errout; |
276 | |
277 | if (t->recur == TS_FSM_HEAD_IGNORE && |
278 | (i != 0 || i == (ntokens - 1))) |
279 | goto errout; |
280 | } |
281 | |
282 | conf = alloc_ts_config(payload: priv_size, gfp_mask); |
283 | if (IS_ERR(ptr: conf)) |
284 | return conf; |
285 | |
286 | conf->flags = flags; |
287 | fsm = ts_config_priv(conf); |
288 | fsm->ntokens = ntokens; |
289 | memcpy(fsm->tokens, pattern, len); |
290 | |
291 | for (i = 0; i < fsm->ntokens; i++) { |
292 | struct ts_fsm_token *t = &fsm->tokens[i]; |
293 | t->type = token_map[t->type]; |
294 | } |
295 | |
296 | return conf; |
297 | |
298 | errout: |
299 | return ERR_PTR(error: err); |
300 | } |
301 | |
302 | static void *fsm_get_pattern(struct ts_config *conf) |
303 | { |
304 | struct ts_fsm *fsm = ts_config_priv(conf); |
305 | return fsm->tokens; |
306 | } |
307 | |
308 | static unsigned int fsm_get_pattern_len(struct ts_config *conf) |
309 | { |
310 | struct ts_fsm *fsm = ts_config_priv(conf); |
311 | return fsm->ntokens * sizeof(struct ts_fsm_token); |
312 | } |
313 | |
314 | static struct ts_ops fsm_ops = { |
315 | .name = "fsm" , |
316 | .find = fsm_find, |
317 | .init = fsm_init, |
318 | .get_pattern = fsm_get_pattern, |
319 | .get_pattern_len = fsm_get_pattern_len, |
320 | .owner = THIS_MODULE, |
321 | .list = LIST_HEAD_INIT(fsm_ops.list) |
322 | }; |
323 | |
324 | static int __init init_fsm(void) |
325 | { |
326 | return textsearch_register(&fsm_ops); |
327 | } |
328 | |
329 | static void __exit exit_fsm(void) |
330 | { |
331 | textsearch_unregister(&fsm_ops); |
332 | } |
333 | |
334 | MODULE_LICENSE("GPL" ); |
335 | |
336 | module_init(init_fsm); |
337 | module_exit(exit_fsm); |
338 | |