1// -*- c-basic-offset: 2 -*-
2/*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
5 * Copyright (C) 2003,2004 Apple Computer, Inc.
6 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
7 * Copyright (C) 2007 Sune Vuorela (debian@pusling.com)
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 *
23 */
24
25#include "regexp.h"
26#include <config-kjs.h>
27#include "lexer.h"
28
29#include <assert.h>
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <wtf/Vector.h>
34
35#if defined _WIN32 || defined _WIN64
36#undef HAVE_SYS_TIME_H
37#endif
38#if HAVE(SYS_TIME_H)
39#include <sys/time.h>
40#include <sys/resource.h>
41
42static const rlim_t sWantedStackSizeLimit = 32*1024*1024;
43
44#endif
45
46using WTF::Vector;
47
48// GCC cstring uses these automatically, but not all implementations do.
49using std::strlen;
50using std::strcpy;
51using std::strncpy;
52using std::memset;
53using std::memcpy;
54
55namespace KJS {
56
57RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
58
59static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx = 0);
60
61// JS regexps can contain Unicode escape sequences (\uxxxx) which
62// are rather uncommon elsewhere. As our regexp libs don't understand
63// them we do the unescaping ourselves internally.
64// Also make sure to expand out any nulls as pcre_compile
65// expects null termination..
66static UString sanitizePattern(const UString &p)
67{
68 UString np;
69
70 bool changed = false;
71 const char* const nil = "\\x00";
72 if (p.find("\\u") >= 0 || p.find(KJS::UChar('\0')) >= 0) {
73 bool escape = false;
74 changed = true;
75 for (int i = 0; i < p.size(); ++i) {
76 UChar c = p[i];
77 if (escape) {
78 escape = false;
79 // we only care about \u
80 if (c == 'u') {
81 // standard unicode escape sequence looks like \uxxxx but
82 // other browsers also accept less than 4 hex digits
83 unsigned short u = 0;
84 int j = 0;
85 for (j = 0; j < 4; ++j) {
86 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
87 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
88 ++i;
89 } else {
90 // sequence incomplete. restore index.
91 // TODO: cleaner way to propagate warning
92 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
93 i -= j;
94 break;
95 }
96 }
97 if (j < 4) {
98 // sequence was incomplete. treat \u as u which IE always
99 // and FF sometimes does.
100 np.append(UString('u'));
101 } else {
102 c = UChar(u);
103 switch (u) {
104 case 0:
105 // Make sure to encode 0, to avoid terminating the string
106 np += UString(nil);
107 break;
108 case '^':
109 case '$':
110 case '\\':
111 case '.':
112 case '*':
113 case '+':
114 case '?':
115 case '(': case ')':
116 case '{': case '}':
117 case '[': case ']':
118 case '|':
119 // escape pattern characters have to remain escaped
120 np.append(UString('\\'));
121 // intentional fallthrough
122 default:
123 np += UString(&c, 1);
124 break;
125 }
126 }
127 continue;
128 }
129 np += UString('\\');
130 np += UString(&c, 1);
131 } else {
132 if (c == '\\')
133 escape = true;
134 else if (c == '\0')
135 np += UString(nil);
136 else
137 np += UString(&c, 1);
138 }
139 }
140 }
141 // Rewrite very inefficient RE formulation:
142 // (.|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+
143 // The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space.
144 if (p.find(".|\\s)")>=0) {
145 if (np.isEmpty())
146 np = p;
147 bool didRewrite = false;
148 WTF::Vector<int> parenIdx;
149 sanitizePatternExtensions(np, &parenIdx);
150 Vector<int>::const_iterator end = parenIdx.end();
151 int previdx = 0;
152 UString tmp;
153 bool nonCapturing = false;
154 for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) {
155 int idx = *it;
156 if (np.size() < idx+6)
157 break;
158 if (np[idx+1] == '?' && np[idx+2] == ':') {
159 nonCapturing = true;
160 idx += 3;
161 } else {
162 ++idx;
163 }
164 if (!(np[idx] == '.' && np[idx+1] == '|' && np[idx+2] == '\\' && np[idx+3] == 's'))
165 continue;
166 if (np.size() >= idx+6 && (np[idx+5] == '+' || (np[idx+5] == '*')) &&
167 // no need to do anything if the pattern is minimal e.g. (.|\s)+?
168 !(np.size() > idx+6 && np[idx+6] == '?')) {
169 didRewrite = true;
170 if (nonCapturing) { // trivial case: (?:.|\s)+ => [\w\W]+
171 tmp.append(np, previdx, idx-previdx-3);
172 tmp.append("[\\w\\W]");
173 tmp.append(np[idx+5]);
174 } else if (np[idx+5] == '*') { // capture zero of one or more: (.|\s)* => (?:[\w\W]*([\w\W])|[\w\W]?)
175 tmp.append(np, previdx, idx-previdx-1);
176 tmp.append("(?:[\\w\\W]*([\\w\\W])|[\\w\\W]?)");
177 } else { // capture last of one or more: (.|\s)+ => [\w\W]*([\w\W])
178 assert(np[idx+5] == '+');
179 tmp.append(np, previdx, idx-previdx-1);
180 tmp.append("[\\w\\W]*([\\w\\W])");
181 }
182 } else {
183 tmp.append(np, previdx, idx-previdx+5);
184 }
185 previdx = idx+6;
186 }
187 if (didRewrite) {
188 tmp.append(np, previdx);
189 fprintf(stderr, "Pattern: %s ", np.ascii());
190 fprintf(stderr, "was rewritten to: %s\n", tmp.ascii());
191 np = tmp;
192 changed = true;
193 }
194 }
195 return (changed ? np : p);
196}
197
198// For now, the only 'extension' to standard we are willing to deal with is
199// a non-escaped closing bracket, outside of a character class. e.g. /.*]/
200static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx)
201{
202 UString newPattern;
203
204 static const int StateNominal = 0, StateOpenBracket = 1;
205 WTF::Vector<int> v;
206 bool escape = false;
207
208 int state = StateNominal;
209 int escapedSinceLastParen = 0;
210 for (int i = 0; i < p.size(); ++i) {
211 UChar c = p[i];
212 if (escape) {
213 escape = false;
214 } else {
215 if (c == '\\') {
216 escape = true;
217 } else if (c == ']') {
218 if (state == StateOpenBracket) {
219 state = StateNominal;
220 } else if (state == StateNominal) {
221 v.append(i);
222 ++escapedSinceLastParen;
223 }
224 } else if (c == '[') {
225 if (state == StateOpenBracket) {
226 v.append(i);
227 ++escapedSinceLastParen;
228 } else if (state == StateNominal) {
229 state = StateOpenBracket;
230 }
231 } else if (c == '(') {
232 if (parenIdx && state == StateNominal) {
233 parenIdx->append(i+escapedSinceLastParen);
234 escapedSinceLastParen = 0;
235 }
236 }
237 }
238 }
239 if (state == StateOpenBracket) {
240 // this is not recoverable.
241 return false;
242 }
243 if (v.size()) {
244 int pos=0;
245 Vector<int>::const_iterator end = v.end();
246 for (Vector<int>::const_iterator it = v.begin(); it != end; ++it) {
247 newPattern += p.substr(pos, *it-pos);
248 pos = *it;
249 newPattern += UString('\\');
250 }
251 newPattern += p.substr(pos);
252 p = newPattern;
253 return true;
254 } else {
255 return false;
256 }
257}
258
259bool RegExp::tryGrowingMaxStackSize = true;
260bool RegExp::didIncreaseMaxStackSize = false;
261
262#if HAVE(SYS_TIME_H)
263rlim_t RegExp::availableStackSize = 8*1024*1024;
264#else
265int RegExp::availableStackSize = 8*1024*1024;
266#endif
267
268RegExp::RegExp(const UString &p, char flags)
269 : _pat(p), _flags(flags), _valid(true), _numSubPatterns(0)
270{
271#ifdef HAVE_PCREPOSIX
272 // Determine whether libpcre has unicode support if need be..
273 if (utf8Support == Unknown) {
274 int supported;
275 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
276 utf8Support = supported ? Supported : Unsupported;
277 }
278#endif
279
280 UString intern = sanitizePattern(p);
281
282#ifdef HAVE_PCREPOSIX
283 int options = 0;
284
285 // we are close but not 100% the same as Perl
286#ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
287 options |= PCRE_JAVASCRIPT_COMPAT;
288#endif
289
290 // Note: the Global flag is already handled by RegExpProtoFunc::execute.
291 // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
292 if (flags & IgnoreCase)
293 options |= PCRE_CASELESS;
294 if (flags & Multiline)
295 options |= PCRE_MULTILINE;
296
297 if (utf8Support == Supported)
298 options |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
299
300 const char *errorMessage;
301 int errorOffset;
302 bool secondTry = false;
303
304 while (1) {
305 RegExpStringContext converted(intern);
306
307 _regex = pcre_compile(converted.buffer(), options, &errorMessage, &errorOffset, NULL);
308
309 if (!_regex) {
310#ifdef PCRE_JAVASCRIPT_COMPAT
311 // The compilation failed. It is likely the pattern contains non-standard extensions.
312 // We may try to tolerate some of those extensions.
313 bool doRecompile = !secondTry && sanitizePatternExtensions(intern);
314 if (doRecompile) {
315 secondTry = true;
316#ifndef NDEBUG
317 fprintf(stderr, "KJS: pcre_compile() failed with '%s' - non-standard extensions detected in pattern, trying second compile after correction.\n", errorMessage);
318#endif
319 continue;
320 }
321#endif
322#ifndef NDEBUG
323 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
324#endif
325 _valid = false;
326 return;
327 }
328 break;
329 }
330
331#ifdef PCRE_INFO_CAPTURECOUNT
332 // Get number of subpatterns that will be returned.
333 pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
334#endif
335
336#else /* HAVE_PCREPOSIX */
337
338 int regflags = 0;
339#ifdef REG_EXTENDED
340 regflags |= REG_EXTENDED;
341#endif
342#ifdef REG_ICASE
343 if ( flags & IgnoreCase )
344 regflags |= REG_ICASE;
345#endif
346
347 //NOTE: Multiline is not feasible with POSIX regex.
348 //if ( f & Multiline )
349 // ;
350 // Note: the Global flag is already handled by RegExpProtoFunc::execute
351
352 int errorCode = regcomp(&_regex, intern.ascii(), regflags);
353 if (errorCode != 0) {
354#ifndef NDEBUG
355 char errorMessage[80];
356 regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
357 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
358#endif
359 _valid = false;
360 }
361#endif
362}
363
364RegExp::~RegExp()
365{
366#ifdef HAVE_PCREPOSIX
367 pcre_free(_regex);
368#else
369 /* TODO: is this really okay after an error ? */
370 regfree(&_regex);
371#endif
372}
373
374void RegExpStringContext::prepareUtf8(const UString& s)
375{
376 // Allocate a buffer big enough to hold all the characters plus \0
377 const int length = s.size();
378 _buffer = new char[length * 3 + 1];
379
380 // Also create buffer for positions. We need one extra character in there,
381 // even past the \0 since the non-empty handling may jump one past the end
382 _originalPos = new int[length * 3 + 2];
383
384 // Convert to runs of 8-bit characters, and generate indices
385 // Note that we do NOT combine surrogate pairs here, as
386 // regexps operate on them as separate characters
387 char *p = _buffer;
388 int *posOut = _originalPos;
389 const UChar *d = s.data();
390 for (int i = 0; i != length; ++i) {
391 unsigned short c = d[i].unicode();
392
393 int sequenceLen;
394 if (c < 0x80) {
395 *p++ = (char)c;
396 sequenceLen = 1;
397 } else if (c < 0x800) {
398 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
399 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
400 sequenceLen = 2;
401 } else {
402 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
403 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
404 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
405 sequenceLen = 3;
406 }
407
408 while (sequenceLen > 0) {
409 *posOut = i;
410 ++posOut;
411 --sequenceLen;
412 }
413 }
414
415 _bufferSize = p - _buffer;
416
417 *p++ = '\0';
418
419 // Record positions for \0, and the fictional character after that.
420 *posOut = length;
421 *(posOut+1) = length+1;
422}
423
424void RegExpStringContext::prepareASCII (const UString& s)
425{
426 _originalPos = 0;
427
428 // Best-effort attempt to get something done
429 // when we don't have utf 8 available -- use
430 // truncated version, and pray for the best
431 CString truncated = s.cstring();
432 _buffer = new char[truncated.size() + 1];
433 memcpy(_buffer, truncated.c_str(), truncated.size());
434 _buffer[truncated.size()] = '\0'; // For _compile use
435 _bufferSize = truncated.size();
436}
437
438RegExpStringContext::RegExpStringContext(const UString &s)
439{
440#ifndef NDEBUG
441 _originalS = s;
442#endif
443
444 if (RegExp::utf8Support == RegExp::Supported)
445 prepareUtf8(s);
446 else
447 prepareASCII(s);
448}
449
450RegExpStringContext::~RegExpStringContext()
451{
452 delete[] _originalPos; _originalPos = 0;
453 delete[] _buffer; _buffer = 0;
454}
455
456UString RegExp::match(const RegExpStringContext& ctx, const UString &s, bool *error, int i, int *pos, int **ovector)
457{
458#ifndef NDEBUG
459 assert(s.data() == ctx._originalS.data()); // Make sure the context is right..
460#endif
461
462 if (i < 0)
463 i = 0;
464 int dummyPos;
465 if (!pos)
466 pos = &dummyPos;
467 *pos = -1;
468 if (ovector)
469 *ovector = 0;
470
471 if (i > s.size() || s.isNull())
472 return UString::null();
473
474#ifdef HAVE_PCREPOSIX
475
476 if (!_regex)
477 return UString::null();
478
479 // Set up the offset vector for the result.
480 // First 2/3 used for result, the last third used by PCRE.
481 int *offsetVector;
482 int offsetVectorSize;
483 int fixedSizeOffsetVector[3];
484 if (!ovector) {
485 offsetVectorSize = 3;
486 offsetVector = fixedSizeOffsetVector;
487 } else {
488 offsetVectorSize = (_numSubPatterns + 1) * 3;
489 offsetVector = new int [offsetVectorSize];
490 }
491
492 int startPos;
493 if (utf8Support == Supported) {
494 startPos = i;
495 while (ctx.originalPos(startPos) < i)
496 ++startPos;
497 } else {
498 startPos = i;
499 }
500
501 int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : 0;
502
503 // See if we have to limit stack space...
504 *error = false;
505 int stackGlutton = 0;
506 pcre_config(PCRE_CONFIG_STACKRECURSE, (void*)&stackGlutton);
507 pcre_extra limits;
508 if (stackGlutton) {
509#if HAVE(SYS_TIME_H)
510 if (tryGrowingMaxStackSize) {
511 rlimit l;
512 getrlimit(RLIMIT_STACK, &l);
513 availableStackSize = l.rlim_cur;
514 if (l.rlim_cur < sWantedStackSizeLimit &&
515 (l.rlim_max > l.rlim_cur || l.rlim_max == RLIM_INFINITY)) {
516 l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ?
517 sWantedStackSizeLimit : std::min(l.rlim_max, sWantedStackSizeLimit);
518 if ((didIncreaseMaxStackSize = !setrlimit( RLIMIT_STACK, &l)))
519 availableStackSize = l.rlim_cur;
520 }
521 tryGrowingMaxStackSize = false;
522 }
523#endif
524
525 limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION;
526 // libPCRE docs claim that it munches about 500 bytes per recursion.
527 // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
528 // (and I've measured 800 in an another instance)
529 // We go somewhat conservative, and use about 3/4ths of that,
530 // especially since we're not exactly light on the stack, either
531 limits.match_limit_recursion = (availableStackSize/1300)*3/4;
532 }
533
534 const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : 0, ctx.buffer(),
535 ctx.bufferSize(), startPos, baseFlags, offsetVector, offsetVectorSize);
536
537 //Now go through and patch up the offsetVector
538 if (utf8Support == Supported)
539 for (int c = 0; c < 2 * numMatches; ++c)
540 if (offsetVector[c] != -1)
541 offsetVector[c] = ctx.originalPos(offsetVector[c]);
542
543 if (numMatches < 0) {
544#ifndef NDEBUG
545 if (numMatches != PCRE_ERROR_NOMATCH)
546 fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
547#endif
548 if (offsetVector != fixedSizeOffsetVector)
549 delete [] offsetVector;
550 if (numMatches == PCRE_ERROR_MATCHLIMIT || numMatches == PCRE_ERROR_RECURSIONLIMIT)
551 *error = true;
552 return UString::null();
553 }
554
555 *pos = offsetVector[0];
556 if (ovector)
557 *ovector = offsetVector;
558 return s.substr(offsetVector[0], offsetVector[1] - offsetVector[0]);
559
560#else
561
562 if (!_valid)
563 return UString::null();
564
565 const unsigned maxMatch = 10;
566 regmatch_t rmatch[maxMatch];
567
568 char *str = strdup(s.ascii()); // TODO: why ???
569 if (regexec(&_regex, str + i, maxMatch, rmatch, 0)) {
570 free(str);
571 return UString::null();
572 }
573 free(str);
574
575 if (!ovector) {
576 *pos = rmatch[0].rm_so + i;
577 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
578 }
579
580 // map rmatch array to ovector used in PCRE case
581 _numSubPatterns = 0;
582 for(unsigned j = 1; j < maxMatch && rmatch[j].rm_so >= 0 ; j++)
583 _numSubPatterns++;
584 int ovecsize = (_numSubPatterns+1)*3; // see above
585 *ovector = new int[ovecsize];
586 for (unsigned j = 0; j < _numSubPatterns + 1; j++) {
587 if (j>maxMatch)
588 break;
589 (*ovector)[2*j] = rmatch[j].rm_so + i;
590 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
591 }
592
593 *pos = (*ovector)[0];
594 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
595
596#endif
597}
598
599} // namespace KJS
600