1 | // -*- c-basic-offset: 2 -*- |
2 | /* |
3 | * This file is part of the KDE libraries |
4 | * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org) |
5 | * Copyright (C) 2003,2004 Apple Computer, Inc. |
6 | * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org) |
7 | * Copyright (C) 2007 Sune Vuorela (debian@pusling.com) |
8 | * |
9 | * This library is free software; you can redistribute it and/or |
10 | * modify it under the terms of the GNU Lesser General Public |
11 | * License as published by the Free Software Foundation; either |
12 | * version 2 of the License, or (at your option) any later version. |
13 | * |
14 | * This library is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | * Lesser General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU Lesser General Public |
20 | * License along with this library; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | * |
23 | */ |
24 | |
25 | #include "regexp.h" |
26 | #include <config-kjs.h> |
27 | #include "lexer.h" |
28 | |
29 | #include <assert.h> |
30 | #include <stdio.h> |
31 | #include <stdlib.h> |
32 | #include <string.h> |
33 | #include <wtf/Vector.h> |
34 | |
35 | #if defined _WIN32 || defined _WIN64 |
36 | #undef HAVE_SYS_TIME_H |
37 | #endif |
38 | #if HAVE(SYS_TIME_H) |
39 | #include <sys/time.h> |
40 | #include <sys/resource.h> |
41 | |
42 | static const rlim_t sWantedStackSizeLimit = 32*1024*1024; |
43 | |
44 | #endif |
45 | |
46 | using WTF::Vector; |
47 | |
48 | // GCC cstring uses these automatically, but not all implementations do. |
49 | using std::strlen; |
50 | using std::strcpy; |
51 | using std::strncpy; |
52 | using std::memset; |
53 | using std::memcpy; |
54 | |
55 | namespace KJS { |
56 | |
57 | RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; |
58 | |
59 | static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx = 0); |
60 | |
61 | // JS regexps can contain Unicode escape sequences (\uxxxx) which |
62 | // are rather uncommon elsewhere. As our regexp libs don't understand |
63 | // them we do the unescaping ourselves internally. |
64 | // Also make sure to expand out any nulls as pcre_compile |
65 | // expects null termination.. |
66 | static UString sanitizePattern(const UString &p) |
67 | { |
68 | UString np; |
69 | |
70 | bool changed = false; |
71 | const char* const nil = "\\x00" ; |
72 | if (p.find("\\u" ) >= 0 || p.find(KJS::UChar('\0')) >= 0) { |
73 | bool escape = false; |
74 | changed = true; |
75 | for (int i = 0; i < p.size(); ++i) { |
76 | UChar c = p[i]; |
77 | if (escape) { |
78 | escape = false; |
79 | // we only care about \u |
80 | if (c == 'u') { |
81 | // standard unicode escape sequence looks like \uxxxx but |
82 | // other browsers also accept less than 4 hex digits |
83 | unsigned short u = 0; |
84 | int j = 0; |
85 | for (j = 0; j < 4; ++j) { |
86 | if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { |
87 | u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); |
88 | ++i; |
89 | } else { |
90 | // sequence incomplete. restore index. |
91 | // TODO: cleaner way to propagate warning |
92 | fprintf(stderr, "KJS: saw %d digit \\u sequence.\n" , j); |
93 | i -= j; |
94 | break; |
95 | } |
96 | } |
97 | if (j < 4) { |
98 | // sequence was incomplete. treat \u as u which IE always |
99 | // and FF sometimes does. |
100 | np.append(UString('u')); |
101 | } else { |
102 | c = UChar(u); |
103 | switch (u) { |
104 | case 0: |
105 | // Make sure to encode 0, to avoid terminating the string |
106 | np += UString(nil); |
107 | break; |
108 | case '^': |
109 | case '$': |
110 | case '\\': |
111 | case '.': |
112 | case '*': |
113 | case '+': |
114 | case '?': |
115 | case '(': case ')': |
116 | case '{': case '}': |
117 | case '[': case ']': |
118 | case '|': |
119 | // escape pattern characters have to remain escaped |
120 | np.append(UString('\\')); |
121 | // intentional fallthrough |
122 | default: |
123 | np += UString(&c, 1); |
124 | break; |
125 | } |
126 | } |
127 | continue; |
128 | } |
129 | np += UString('\\'); |
130 | np += UString(&c, 1); |
131 | } else { |
132 | if (c == '\\') |
133 | escape = true; |
134 | else if (c == '\0') |
135 | np += UString(nil); |
136 | else |
137 | np += UString(&c, 1); |
138 | } |
139 | } |
140 | } |
141 | // Rewrite very inefficient RE formulation: |
142 | // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+ |
143 | // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space. |
144 | if (p.find(".|\\s)" )>=0) { |
145 | if (np.isEmpty()) |
146 | np = p; |
147 | bool didRewrite = false; |
148 | WTF::Vector<int> parenIdx; |
149 | sanitizePatternExtensions(np, &parenIdx); |
150 | Vector<int>::const_iterator end = parenIdx.end(); |
151 | int previdx = 0; |
152 | UString tmp; |
153 | bool nonCapturing = false; |
154 | for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) { |
155 | int idx = *it; |
156 | if (np.size() < idx+6) |
157 | break; |
158 | if (np[idx+1] == '?' && np[idx+2] == ':') { |
159 | nonCapturing = true; |
160 | idx += 3; |
161 | } else { |
162 | ++idx; |
163 | } |
164 | if (!(np[idx] == '.' && np[idx+1] == '|' && np[idx+2] == '\\' && np[idx+3] == 's')) |
165 | continue; |
166 | if (np.size() >= idx+6 && (np[idx+5] == '+' || (np[idx+5] == '*')) && |
167 | // no need to do anything if the pattern is minimal e.g. (.|\s)+? |
168 | !(np.size() > idx+6 && np[idx+6] == '?')) { |
169 | didRewrite = true; |
170 | if (nonCapturing) { // trivial case: (?:.|\s)+ => [\w\W]+ |
171 | tmp.append(np, previdx, idx-previdx-3); |
172 | tmp.append("[\\w\\W]" ); |
173 | tmp.append(np[idx+5]); |
174 | } else if (np[idx+5] == '*') { // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?) |
175 | tmp.append(np, previdx, idx-previdx-1); |
176 | tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)" ); |
177 | } else { // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W]) |
178 | assert(np[idx+5] == '+'); |
179 | tmp.append(np, previdx, idx-previdx-1); |
180 | tmp.append("[\\w\\W]*([\\w\\W])" ); |
181 | } |
182 | } else { |
183 | tmp.append(np, previdx, idx-previdx+5); |
184 | } |
185 | previdx = idx+6; |
186 | } |
187 | if (didRewrite) { |
188 | tmp.append(np, previdx); |
189 | fprintf(stderr, "Pattern: %s " , np.ascii()); |
190 | fprintf(stderr, "was rewritten to: %s\n" , tmp.ascii()); |
191 | np = tmp; |
192 | changed = true; |
193 | } |
194 | } |
195 | return (changed ? np : p); |
196 | } |
197 | |
198 | // For now, the only 'extension' to standard we are willing to deal with is |
199 | // a non-escaped closing bracket, outside of a character class. e.g. /.*]/ |
200 | static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx) |
201 | { |
202 | UString newPattern; |
203 | |
204 | static const int StateNominal = 0, StateOpenBracket = 1; |
205 | WTF::Vector<int> v; |
206 | bool escape = false; |
207 | |
208 | int state = StateNominal; |
209 | int escapedSinceLastParen = 0; |
210 | for (int i = 0; i < p.size(); ++i) { |
211 | UChar c = p[i]; |
212 | if (escape) { |
213 | escape = false; |
214 | } else { |
215 | if (c == '\\') { |
216 | escape = true; |
217 | } else if (c == ']') { |
218 | if (state == StateOpenBracket) { |
219 | state = StateNominal; |
220 | } else if (state == StateNominal) { |
221 | v.append(i); |
222 | ++escapedSinceLastParen; |
223 | } |
224 | } else if (c == '[') { |
225 | if (state == StateOpenBracket) { |
226 | v.append(i); |
227 | ++escapedSinceLastParen; |
228 | } else if (state == StateNominal) { |
229 | state = StateOpenBracket; |
230 | } |
231 | } else if (c == '(') { |
232 | if (parenIdx && state == StateNominal) { |
233 | parenIdx->append(i+escapedSinceLastParen); |
234 | escapedSinceLastParen = 0; |
235 | } |
236 | } |
237 | } |
238 | } |
239 | if (state == StateOpenBracket) { |
240 | // this is not recoverable. |
241 | return false; |
242 | } |
243 | if (v.size()) { |
244 | int pos=0; |
245 | Vector<int>::const_iterator end = v.end(); |
246 | for (Vector<int>::const_iterator it = v.begin(); it != end; ++it) { |
247 | newPattern += p.substr(pos, *it-pos); |
248 | pos = *it; |
249 | newPattern += UString('\\'); |
250 | } |
251 | newPattern += p.substr(pos); |
252 | p = newPattern; |
253 | return true; |
254 | } else { |
255 | return false; |
256 | } |
257 | } |
258 | |
259 | bool RegExp::tryGrowingMaxStackSize = true; |
260 | bool RegExp::didIncreaseMaxStackSize = false; |
261 | |
262 | #if HAVE(SYS_TIME_H) |
263 | rlim_t RegExp::availableStackSize = 8*1024*1024; |
264 | #else |
265 | int RegExp::availableStackSize = 8*1024*1024; |
266 | #endif |
267 | |
268 | RegExp::RegExp(const UString &p, char flags) |
269 | : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0) |
270 | { |
271 | #ifdef HAVE_PCREPOSIX |
272 | // Determine whether libpcre has unicode support if need be.. |
273 | if (utf8Support == Unknown) { |
274 | int supported; |
275 | pcre_config(PCRE_CONFIG_UTF8, (void*)&supported); |
276 | utf8Support = supported ? Supported : Unsupported; |
277 | } |
278 | #endif |
279 | |
280 | UString intern = sanitizePattern(p); |
281 | |
282 | #ifdef HAVE_PCREPOSIX |
283 | int options = 0; |
284 | |
285 | // we are close but not 100% the same as Perl |
286 | #ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7 |
287 | options |= PCRE_JAVASCRIPT_COMPAT; |
288 | #endif |
289 | |
290 | // Note: the Global flag is already handled by RegExpProtoFunc::execute. |
291 | // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute. |
292 | if (flags & IgnoreCase) |
293 | options |= PCRE_CASELESS; |
294 | if (flags & Multiline) |
295 | options |= PCRE_MULTILINE; |
296 | |
297 | if (utf8Support == Supported) |
298 | options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK); |
299 | |
300 | const char *errorMessage; |
301 | int errorOffset; |
302 | bool secondTry = false; |
303 | |
304 | while (1) { |
305 | RegExpStringContext converted(intern); |
306 | |
307 | _regex = pcre_compile(converted.buffer(), options, &errorMessage, &errorOffset, NULL); |
308 | |
309 | if (!_regex) { |
310 | #ifdef PCRE_JAVASCRIPT_COMPAT |
311 | // The compilation failed. It is likely the pattern contains non-standard extensions. |
312 | // We may try to tolerate some of those extensions. |
313 | bool doRecompile = !secondTry && sanitizePatternExtensions(intern); |
314 | if (doRecompile) { |
315 | secondTry = true; |
316 | #ifndef NDEBUG |
317 | fprintf(stderr, "KJS: pcre_compile() failed with '%s' - non-standard extensions detected in pattern, trying second compile after correction.\n" , errorMessage); |
318 | #endif |
319 | continue; |
320 | } |
321 | #endif |
322 | #ifndef NDEBUG |
323 | fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n" , errorMessage); |
324 | #endif |
325 | _valid = false; |
326 | return; |
327 | } |
328 | break; |
329 | } |
330 | |
331 | #ifdef PCRE_INFO_CAPTURECOUNT |
332 | // Get number of subpatterns that will be returned. |
333 | pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns); |
334 | #endif |
335 | |
336 | #else /* HAVE_PCREPOSIX */ |
337 | |
338 | int regflags = 0; |
339 | #ifdef REG_EXTENDED |
340 | regflags |= REG_EXTENDED; |
341 | #endif |
342 | #ifdef REG_ICASE |
343 | if ( flags & IgnoreCase ) |
344 | regflags |= REG_ICASE; |
345 | #endif |
346 | |
347 | //NOTE: Multiline is not feasible with POSIX regex. |
348 | //if ( f & Multiline ) |
349 | // ; |
350 | // Note: the Global flag is already handled by RegExpProtoFunc::execute |
351 | |
352 | int errorCode = regcomp(&_regex, intern.ascii(), regflags); |
353 | if (errorCode != 0) { |
354 | #ifndef NDEBUG |
355 | char errorMessage[80]; |
356 | regerror(errorCode, &_regex, errorMessage, sizeof errorMessage); |
357 | fprintf(stderr, "KJS: regcomp failed with '%s'\n" , errorMessage); |
358 | #endif |
359 | _valid = false; |
360 | } |
361 | #endif |
362 | } |
363 | |
364 | RegExp::~RegExp() |
365 | { |
366 | #ifdef HAVE_PCREPOSIX |
367 | pcre_free(_regex); |
368 | #else |
369 | /* TODO: is this really okay after an error ? */ |
370 | regfree(&_regex); |
371 | #endif |
372 | } |
373 | |
374 | void RegExpStringContext::prepareUtf8(const UString& s) |
375 | { |
376 | // Allocate a buffer big enough to hold all the characters plus \0 |
377 | const int length = s.size(); |
378 | _buffer = new char[length * 3 + 1]; |
379 | |
380 | // Also create buffer for positions. We need one extra character in there, |
381 | // even past the \0 since the non-empty handling may jump one past the end |
382 | _originalPos = new int[length * 3 + 2]; |
383 | |
384 | // Convert to runs of 8-bit characters, and generate indices |
385 | // Note that we do NOT combine surrogate pairs here, as |
386 | // regexps operate on them as separate characters |
387 | char *p = _buffer; |
388 | int *posOut = _originalPos; |
389 | const UChar *d = s.data(); |
390 | for (int i = 0; i != length; ++i) { |
391 | unsigned short c = d[i].unicode(); |
392 | |
393 | int sequenceLen; |
394 | if (c < 0x80) { |
395 | *p++ = (char)c; |
396 | sequenceLen = 1; |
397 | } else if (c < 0x800) { |
398 | *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 |
399 | *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set |
400 | sequenceLen = 2; |
401 | } else { |
402 | *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 |
403 | *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set |
404 | *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set |
405 | sequenceLen = 3; |
406 | } |
407 | |
408 | while (sequenceLen > 0) { |
409 | *posOut = i; |
410 | ++posOut; |
411 | --sequenceLen; |
412 | } |
413 | } |
414 | |
415 | _bufferSize = p - _buffer; |
416 | |
417 | *p++ = '\0'; |
418 | |
419 | // Record positions for \0, and the fictional character after that. |
420 | *posOut = length; |
421 | *(posOut+1) = length+1; |
422 | } |
423 | |
424 | void RegExpStringContext::prepareASCII (const UString& s) |
425 | { |
426 | _originalPos = 0; |
427 | |
428 | // Best-effort attempt to get something done |
429 | // when we don't have utf 8 available -- use |
430 | // truncated version, and pray for the best |
431 | CString truncated = s.cstring(); |
432 | _buffer = new char[truncated.size() + 1]; |
433 | memcpy(_buffer, truncated.c_str(), truncated.size()); |
434 | _buffer[truncated.size()] = '\0'; // For _compile use |
435 | _bufferSize = truncated.size(); |
436 | } |
437 | |
438 | RegExpStringContext::RegExpStringContext(const UString &s) |
439 | { |
440 | #ifndef NDEBUG |
441 | _originalS = s; |
442 | #endif |
443 | |
444 | if (RegExp::utf8Support == RegExp::Supported) |
445 | prepareUtf8(s); |
446 | else |
447 | prepareASCII(s); |
448 | } |
449 | |
450 | RegExpStringContext::~RegExpStringContext() |
451 | { |
452 | delete[] _originalPos; _originalPos = 0; |
453 | delete[] _buffer; _buffer = 0; |
454 | } |
455 | |
456 | UString RegExp::match(const RegExpStringContext& ctx, const UString &s, bool *error, int i, int *pos, int **ovector) |
457 | { |
458 | #ifndef NDEBUG |
459 | assert(s.data() == ctx._originalS.data()); // Make sure the context is right.. |
460 | #endif |
461 | |
462 | if (i < 0) |
463 | i = 0; |
464 | int dummyPos; |
465 | if (!pos) |
466 | pos = &dummyPos; |
467 | *pos = -1; |
468 | if (ovector) |
469 | *ovector = 0; |
470 | |
471 | if (i > s.size() || s.isNull()) |
472 | return UString::null(); |
473 | |
474 | #ifdef HAVE_PCREPOSIX |
475 | |
476 | if (!_regex) |
477 | return UString::null(); |
478 | |
479 | // Set up the offset vector for the result. |
480 | // First 2/3 used for result, the last third used by PCRE. |
481 | int *offsetVector; |
482 | int offsetVectorSize; |
483 | int fixedSizeOffsetVector[3]; |
484 | if (!ovector) { |
485 | offsetVectorSize = 3; |
486 | offsetVector = fixedSizeOffsetVector; |
487 | } else { |
488 | offsetVectorSize = (_numSubPatterns + 1) * 3; |
489 | offsetVector = new int [offsetVectorSize]; |
490 | } |
491 | |
492 | int startPos; |
493 | if (utf8Support == Supported) { |
494 | startPos = i; |
495 | while (ctx.originalPos(startPos) < i) |
496 | ++startPos; |
497 | } else { |
498 | startPos = i; |
499 | } |
500 | |
501 | int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0; |
502 | |
503 | // See if we have to limit stack space... |
504 | *error = false; |
505 | int stackGlutton = 0; |
506 | pcre_config(PCRE_CONFIG_STACKRECURSE, (void*)&stackGlutton); |
507 | pcre_extra limits; |
508 | if (stackGlutton) { |
509 | #if HAVE(SYS_TIME_H) |
510 | if (tryGrowingMaxStackSize) { |
511 | rlimit l; |
512 | getrlimit(RLIMIT_STACK, &l); |
513 | availableStackSize = l.rlim_cur; |
514 | if (l.rlim_cur < sWantedStackSizeLimit && |
515 | (l.rlim_max > l.rlim_cur || l.rlim_max == RLIM_INFINITY)) { |
516 | l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ? |
517 | sWantedStackSizeLimit : std::min(l.rlim_max, sWantedStackSizeLimit); |
518 | if ((didIncreaseMaxStackSize = !setrlimit( RLIMIT_STACK, &l))) |
519 | availableStackSize = l.rlim_cur; |
520 | } |
521 | tryGrowingMaxStackSize = false; |
522 | } |
523 | #endif |
524 | |
525 | limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION; |
526 | // libPCRE docs claim that it munches about 500 bytes per recursion. |
527 | // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes |
528 | // (and I've measured 800 in an another instance) |
529 | // We go somewhat conservative, and use about 3/4ths of that, |
530 | // especially since we're not exactly light on the stack, either |
531 | limits.match_limit_recursion = (availableStackSize/1300)*3/4; |
532 | } |
533 | |
534 | const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : 0, ctx.buffer(), |
535 | ctx.bufferSize(), startPos, baseFlags, offsetVector, offsetVectorSize); |
536 | |
537 | //Now go through and patch up the offsetVector |
538 | if (utf8Support == Supported) |
539 | for (int c = 0; c < 2 * numMatches; ++c) |
540 | if (offsetVector[c] != -1) |
541 | offsetVector[c] = ctx.originalPos(offsetVector[c]); |
542 | |
543 | if (numMatches < 0) { |
544 | #ifndef NDEBUG |
545 | if (numMatches != PCRE_ERROR_NOMATCH) |
546 | fprintf(stderr, "KJS: pcre_exec() failed with result %d\n" , numMatches); |
547 | #endif |
548 | if (offsetVector != fixedSizeOffsetVector) |
549 | delete [] offsetVector; |
550 | if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT) |
551 | *error = true; |
552 | return UString::null(); |
553 | } |
554 | |
555 | *pos = offsetVector[0]; |
556 | if (ovector) |
557 | *ovector = offsetVector; |
558 | return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]); |
559 | |
560 | #else |
561 | |
562 | if (!_valid) |
563 | return UString::null(); |
564 | |
565 | const unsigned maxMatch = 10; |
566 | regmatch_t rmatch[maxMatch]; |
567 | |
568 | char *str = strdup(s.ascii()); // TODO: why ??? |
569 | if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) { |
570 | free(str); |
571 | return UString::null(); |
572 | } |
573 | free(str); |
574 | |
575 | if (!ovector) { |
576 | *pos = rmatch[0].rm_so + i; |
577 | return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so); |
578 | } |
579 | |
580 | // map rmatch array to ovector used in PCRE case |
581 | _numSubPatterns = 0; |
582 | for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) |
583 | _numSubPatterns++; |
584 | int ovecsize = (_numSubPatterns+1)*3; // see above |
585 | *ovector = new int[ovecsize]; |
586 | for (unsigned j = 0; j < _numSubPatterns + 1; j++) { |
587 | if (j>maxMatch) |
588 | break; |
589 | (*ovector)[2*j] = rmatch[j].rm_so + i; |
590 | (*ovector)[2*j+1] = rmatch[j].rm_eo + i; |
591 | } |
592 | |
593 | *pos = (*ovector)[0]; |
594 | return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]); |
595 | |
596 | #endif |
597 | } |
598 | |
599 | } // namespace KJS |
600 | |