regexp.cpp [kdelibs/kjs/regexp.cpp]

1	// -- c-basic-offset: 2 --
2	/*
3	* This file is part of the KDE libraries
4	* Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
5	* Copyright (C) 2003,2004 Apple Computer, Inc.
6	* Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
7	* Copyright (C) 2007 Sune Vuorela (debian@pusling.com)
8	*
9	* This library is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public
11	* License as published by the Free Software Foundation; either
12	* version 2 of the License, or (at your option) any later version.
13	*
14	* This library is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with this library; if not, write to the Free Software
21	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22	*
23	*/
24
25	#include "regexp.h"
26	#include <config-kjs.h>
27	#include "lexer.h"
28
29	#include <assert.h>
30	#include <stdio.h>
31	#include <stdlib.h>
32	#include <string.h>
33	#include <wtf/Vector.h>
34
35	#if defined _WIN32 \|\| defined _WIN64
36	#undef HAVE_SYS_TIME_H
37	#endif
38	#if HAVE(SYS_TIME_H)
39	#include <sys/time.h>
40	#include <sys/resource.h>
41
42	static const rlim_t sWantedStackSizeLimit = `32``1024``1024`;
43
44	#endif
45
46	using WTF::Vector;
47
48	// GCC cstring uses these automatically, but not all implementations do.
49	using std::strlen;
50	using std::strcpy;
51	using std::strncpy;
52	using std::memset;
53	using std::memcpy;
54
55	namespace KJS {
56
57	RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
58
59	static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx = `0`);
60
61	// JS regexps can contain Unicode escape sequences (\uxxxx) which
62	// are rather uncommon elsewhere. As our regexp libs don't understand
63	// them we do the unescaping ourselves internally.
64	// Also make sure to expand out any nulls as pcre_compile
65	// expects null termination..
66	static UString sanitizePattern(const UString &p)
67	{
68	UString np;
69
70	bool changed = false;
71	const char* const nil = "\\x00";
72	if (p.find("\\u") >= `0` \|\| p.find(KJS::UChar ('\0')) >= `0`) {
73	bool escape = false;
74	changed = true;
75	for (int i = `0`; i < p.size(); ++i) {
76	UChar c = p [i];
77	if (escape) {
78	escape = false;
79	// we only care about \u
80	if (c == 'u') {
81	// standard unicode escape sequence looks like \uxxxx but
82	// other browsers also accept less than 4 hex digits
83	unsigned short u = `0`;
84	int j = `0`;
85	for (j = `0`; j < `4`; ++j) {
86	if (i + `1` < p.size() && Lexer::isHexDigit(p [i + `1`].unicode())) {
87	u = (u << `4`) + Lexer::convertHex(p [i + `1`].unicode());
88	++i;
89	} else {
90	// sequence incomplete. restore index.
91	// TODO: cleaner way to propagate warning
92	fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
93	i -= j;
94	break;
95	}
96	}
97	if (j < `4`) {
98	// sequence was incomplete. treat \u as u which IE always
99	// and FF sometimes does.
100	np.append(UString('u'));
101	} else {
102	c = UChar (u);
103	switch (u) {
104	case `0`:
105	// Make sure to encode 0, to avoid terminating the string
106	np += UString(nil);
107	break;
108	case '^':
109	case '$':
110	case '\\':
111	case '.':
112	case '*':
113	case '+':
114	case '?':
115	case '(': case ')':
116	case '{': case '}':
117	case '[': case ']':
118	case '\|':
119	// escape pattern characters have to remain escaped
120	np.append(UString('\\'));
121	// intentional fallthrough
122	default:
123	np += UString (&c, `1`);
124	break;
125	}
126	}
127	continue;
128	}
129	np += UString('\\');
130	np += UString (&c, `1`);
131	} else {
132	if (c == '\\')
133	escape = true;
134	else if (c == '\0')
135	np += UString(nil);
136	else
137	np += UString (&c, `1`);
138	}
139	}
140	}
141	// Rewrite very inefficient RE formulation:
142	// (.\|\s)+ is often used instead of the less intuitive, but vastly preferable [\w\W]+
143	// The first wording needs to recurse at each character matched in libPCRE, leading to rapid exhaustion of stack space.
144	if (p.find(".\|\\s)")>=`0`) {
145	if (np.isEmpty())
146	np = p;
147	bool didRewrite = false;
148	WTF::Vector<int> parenIdx;
149	sanitizePatternExtensions(np, &parenIdx);
150	Vector<int>::const_iterator end = parenIdx.end();
151	int previdx = `0`;
152	UString tmp;
153	bool nonCapturing = false;
154	for (Vector<int>::const_iterator it = parenIdx.begin(); it != end; ++it) {
155	int idx = *it;
156	if (np.size() < idx+`6`)
157	break;
158	if (np [idx+`1`] == '?' && np [idx+`2`] == ':') {
159	nonCapturing = true;
160	idx += `3`;
161	} else {
162	++idx;
163	}
164	if (!(np [idx] == '.' && np [idx+`1`] == '\|' && np [idx+`2`] == '\\' && np [idx+`3`] == 's'))
165	continue;
166	if (np.size() >= idx+`6` && (np [idx+`5`] == '+' \|\| (np [idx+`5`] == '*')) &&
167	// no need to do anything if the pattern is minimal e.g. (.\|\s)+?
168	!(np.size() > idx+`6` && np [idx+`6`] == '?')) {
169	didRewrite = true;
170	if (nonCapturing) { // trivial case: (?:.\|\s)+ => [\w\W]+
171	tmp.append(np, previdx, idx-previdx-`3`);
172	tmp.append("[\\w\\W]");
173	tmp.append(np [idx+`5`]);
174	} else if (np [idx+`5`] == '') { // capture zero of one or more: (.\|\s)* => (?:[\w\W]([\w\W])\|[\w\W]?)
175	tmp.append(np, previdx, idx-previdx-`1`);
176	tmp.append("(?:[\\w\\W]*([\\w\\W])\|[\\w\\W]?)");
177	} else { // capture last of one or more: (.\|\s)+ => [\w\W]([\w\W])*
178	assert(np[idx+`5`] == '+');
179	tmp.append(np, previdx, idx-previdx-`1`);
180	tmp.append("[\\w\\W]*([\\w\\W])");
181	}
182	} else {
183	tmp.append(np, previdx, idx-previdx+`5`);
184	}
185	previdx = idx+`6`;
186	}
187	if (didRewrite) {
188	tmp.append(np, previdx);
189	fprintf(stderr, "Pattern: %s ", np.ascii());
190	fprintf(stderr, "was rewritten to: %s\n", tmp.ascii());
191	np = tmp;
192	changed = true;
193	}
194	}
195	return (changed ? np : p);
196	}
197
198	// For now, the only 'extension' to standard we are willing to deal with is
199	// a non-escaped closing bracket, outside of a character class. e.g. /.]/*
200	static bool sanitizePatternExtensions(UString &p, WTF::Vector<int>* parenIdx)
201	{
202	UString newPattern;
203
204	static const int StateNominal = `0`, StateOpenBracket = `1`;
205	WTF::Vector<int> v;
206	bool escape = false;
207
208	int state = StateNominal;
209	int escapedSinceLastParen = `0`;
210	for (int i = `0`; i < p.size(); ++i) {
211	UChar c = p [i];
212	if (escape) {
213	escape = false;
214	} else {
215	if (c == '\\') {
216	escape = true;
217	} else if (c == ']') {
218	if (state == StateOpenBracket) {
219	state = StateNominal;
220	} else if (state == StateNominal) {
221	v.append(i);
222	++escapedSinceLastParen;
223	}
224	} else if (c == '[') {
225	if (state == StateOpenBracket) {
226	v.append(i);
227	++escapedSinceLastParen;
228	} else if (state == StateNominal) {
229	state = StateOpenBracket;
230	}
231	} else if (c == '(') {
232	if (parenIdx && state == StateNominal) {
233	parenIdx->append(i+escapedSinceLastParen);
234	escapedSinceLastParen = `0`;
235	}
236	}
237	}
238	}
239	if (state == StateOpenBracket) {
240	// this is not recoverable.
241	return false;
242	}
243	if (v.size()) {
244	int pos=`0`;
245	Vector<int>::const_iterator end = v.end();
246	for (Vector<int>::const_iterator it = v.begin(); it != end; ++it) {
247	newPattern += p.substr(pos, *it-pos);
248	pos = *it;
249	newPattern += UString('\\');
250	}
251	newPattern += p.substr(pos);
252	p = newPattern;
253	return true;
254	} else {
255	return false;
256	}
257	}
258
259	bool RegExp::tryGrowingMaxStackSize = true;
260	bool RegExp::didIncreaseMaxStackSize = false;
261
262	#if HAVE(SYS_TIME_H)
263	rlim_t RegExp::availableStackSize = `8``1024``1024`;
264	#else
265	int RegExp::availableStackSize = `8``1024``1024`;
266	#endif
267
268	RegExp::RegExp(const UString &p, char flags)
269	: _pat (p), _flags(flags), _valid(true), _numSubPatterns(`0`)
270	{
271	#ifdef HAVE_PCREPOSIX
272	// Determine whether libpcre has unicode support if need be..
273	if (utf8Support == Unknown) {
274	int supported;
275	pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
276	utf8Support = supported ? Supported : Unsupported;
277	}
278	#endif
279
280	UString intern = sanitizePattern(p);
281
282	#ifdef HAVE_PCREPOSIX
283	int options = `0`;
284
285	// we are close but not 100% the same as Perl
286	#ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
287	options \|= PCRE_JAVASCRIPT_COMPAT;
288	#endif
289
290	// Note: the Global flag is already handled by RegExpProtoFunc::execute.
291	// FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
292	if (flags & IgnoreCase)
293	options \|= PCRE_CASELESS;
294	if (flags & Multiline)
295	options \|= PCRE_MULTILINE;
296
297	if (utf8Support == Supported)
298	options \|= (PCRE_UTF8 \| PCRE_NO_UTF8_CHECK);
299
300	const char *errorMessage;
301	int errorOffset;
302	bool secondTry = false;
303
304	while (`1`) {
305	RegExpStringContext converted(intern);
306
307	_regex = pcre_compile(converted.buffer(), options, &errorMessage, &errorOffset, NULL);
308
309	if (!_regex) {
310	#ifdef PCRE_JAVASCRIPT_COMPAT
311	// The compilation failed. It is likely the pattern contains non-standard extensions.
312	// We may try to tolerate some of those extensions.
313	bool doRecompile = !secondTry && sanitizePatternExtensions(intern);
314	if (doRecompile) {
315	secondTry = true;
316	#ifndef NDEBUG
317	fprintf(stderr, "KJS: pcre_compile() failed with '%s' - non-standard extensions detected in pattern, trying second compile after correction.\n", errorMessage);
318	#endif
319	continue;
320	}
321	#endif
322	#ifndef NDEBUG
323	fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMessage);
324	#endif
325	_valid = false;
326	return;
327	}
328	break;
329	}
330
331	#ifdef PCRE_INFO_CAPTURECOUNT
332	// Get number of subpatterns that will be returned.
333	pcre_fullinfo(_regex, NULL, PCRE_INFO_CAPTURECOUNT, &_numSubPatterns);
334	#endif
335
336	#else /* HAVE_PCREPOSIX */
337
338	int regflags = `0`;
339	#ifdef REG_EXTENDED
340	regflags \|= REG_EXTENDED;
341	#endif
342	#ifdef REG_ICASE
343	if ( flags & IgnoreCase )
344	regflags \|= REG_ICASE;
345	#endif
346
347	//NOTE: Multiline is not feasible with POSIX regex.
348	//if ( f & Multiline )
349	// ;
350	// Note: the Global flag is already handled by RegExpProtoFunc::execute
351
352	int errorCode = regcomp(&_regex, intern.ascii(), regflags);
353	if (errorCode != `0`) {
354	#ifndef NDEBUG
355	char errorMessage[`80`];
356	regerror(errorCode, &_regex, errorMessage, sizeof errorMessage);
357	fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
358	#endif
359	_valid = false;
360	}
361	#endif
362	}
363
364	RegExp::~RegExp()
365	{
366	#ifdef HAVE_PCREPOSIX
367	pcre_free(_regex);
368	#else
369	/ TODO: is this really okay after an error ? /
370	regfree(&_regex);
371	#endif
372	}
373
374	void RegExpStringContext::prepareUtf8(const UString& s)
375	{
376	// Allocate a buffer big enough to hold all the characters plus \0
377	const int length = s.size();
378	_buffer = new char[length * `3` + `1`];
379
380	// Also create buffer for positions. We need one extra character in there,
381	// even past the \0 since the non-empty handling may jump one past the end
382	_originalPos = new int[length * `3` + `2`];
383
384	// Convert to runs of 8-bit characters, and generate indices
385	// Note that we do NOT combine surrogate pairs here, as
386	// regexps operate on them as separate characters
387	char *p = _buffer;
388	int *posOut = _originalPos;
389	const UChar *d = s.data();
390	for (int i = `0`; i != length; ++i) {
391	unsigned short c = d[i].unicode();
392
393	int sequenceLen;
394	if (c < `0x80`) {
395	p++ = (char*)c;
396	sequenceLen = `1`;
397	} else if (c < `0x800`) {
398	p++ = (char)((c >> `6`) \| `0xC0`); // C0 is the 2-byte flag for UTF-8*
399	p++ = (char)((c \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
400	sequenceLen = `2`;
401	} else {
402	p++ = (char)((c >> `12`) \| `0xE0`); // E0 is the 3-byte flag for UTF-8*
403	p++ = (char)(((c >> `6`) \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
404	p++ = (char)((c \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
405	sequenceLen = `3`;
406	}
407
408	while (sequenceLen > `0`) {
409	*posOut = i;
410	++posOut;
411	--sequenceLen;
412	}
413	}
414
415	_bufferSize = p - _buffer;
416
417	*p++ = '\0';
418
419	// Record positions for \0, and the fictional character after that.
420	*posOut = length;
421	*(posOut+`1`) = length+`1`;
422	}
423
424	void RegExpStringContext::prepareASCII (const UString& s)
425	{
426	_originalPos = `0`;
427
428	// Best-effort attempt to get something done
429	// when we don't have utf 8 available -- use
430	// truncated version, and pray for the best
431	CString truncated = s.cstring();
432	_buffer = new char[truncated.size() + `1`];
433	memcpy(_buffer, truncated.c_str(), truncated.size());
434	_buffer[truncated.size()] = '\0'; // For _compile use
435	_bufferSize = truncated.size();
436	}
437
438	RegExpStringContext::RegExpStringContext(const UString &s)
439	{
440	#ifndef NDEBUG
441	_originalS = s;
442	#endif
443
444	if (RegExp::utf8Support == RegExp::Supported)
445	prepareUtf8(s);
446	else
447	prepareASCII(s);
448	}
449
450	RegExpStringContext::~RegExpStringContext()
451	{
452	delete[] _originalPos; _originalPos = `0`;
453	delete[] _buffer; _buffer = `0`;
454	}
455
456	UString RegExp::match(const RegExpStringContext& ctx, const UString &s, bool error, int* i, int pos, int* **ovector)
457	{
458	#ifndef NDEBUG
459	assert(s.data() == ctx._originalS.data()); // Make sure the context is right..
460	#endif
461
462	if (i < `0`)
463	i = `0`;
464	int dummyPos;
465	if (!pos)
466	pos = &dummyPos;
467	*pos = -`1`;
468	if (ovector)
469	*ovector = `0`;
470
471	if (i > s.size() \|\| s.isNull())
472	return UString::null();
473
474	#ifdef HAVE_PCREPOSIX
475
476	if (!_regex)
477	return UString::null();
478
479	// Set up the offset vector for the result.
480	// First 2/3 used for result, the last third used by PCRE.
481	int *offsetVector;
482	int offsetVectorSize;
483	int fixedSizeOffsetVector[`3`];
484	if (!ovector) {
485	offsetVectorSize = `3`;
486	offsetVector = fixedSizeOffsetVector;
487	} else {
488	offsetVectorSize = (_numSubPatterns + `1`) * `3`;
489	offsetVector = new int [offsetVectorSize];
490	}
491
492	int startPos;
493	if (utf8Support == Supported) {
494	startPos = i;
495	while (ctx.originalPos(startPos) < i)
496	++startPos;
497	} else {
498	startPos = i;
499	}
500
501	int baseFlags = utf8Support == Supported ? PCRE_NO_UTF8_CHECK : `0`;
502
503	// See if we have to limit stack space...
504	error = false*;
505	int stackGlutton = `0`;
506	pcre_config(PCRE_CONFIG_STACKRECURSE, (void*)&stackGlutton);
507	pcre_extra limits;
508	if (stackGlutton) {
509	#if HAVE(SYS_TIME_H)
510	if (tryGrowingMaxStackSize) {
511	rlimit l;
512	getrlimit(RLIMIT_STACK, &l);
513	availableStackSize = l.rlim_cur;
514	if (l.rlim_cur < sWantedStackSizeLimit &&
515	(l.rlim_max > l.rlim_cur \|\| l.rlim_max == RLIM_INFINITY)) {
516	l.rlim_cur = (l.rlim_max == RLIM_INFINITY) ?
517	sWantedStackSizeLimit : std::min(l.rlim_max, sWantedStackSizeLimit);
518	if ((didIncreaseMaxStackSize = !setrlimit( RLIMIT_STACK, &l)))
519	availableStackSize = l.rlim_cur;
520	}
521	tryGrowingMaxStackSize = false;
522	}
523	#endif
524
525	limits.flags = PCRE_EXTRA_MATCH_LIMIT_RECURSION;
526	// libPCRE docs claim that it munches about 500 bytes per recursion.
527	// The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
528	// (and I've measured 800 in an another instance)
529	// We go somewhat conservative, and use about 3/4ths of that,
530	// especially since we're not exactly light on the stack, either
531	limits.match_limit_recursion = (availableStackSize/`1300`)*`3`/`4`;
532	}
533
534	const int numMatches = pcre_exec(_regex, stackGlutton ? &limits : `0`, ctx.buffer(),
535	ctx.bufferSize(), startPos, baseFlags, offsetVector, offsetVectorSize);
536
537	//Now go through and patch up the offsetVector
538	if (utf8Support == Supported)
539	for (int c = `0`; c < `2` * numMatches; ++c)
540	if (offsetVector[c] != -`1`)
541	offsetVector[c] = ctx.originalPos(offsetVector[c]);
542
543	if (numMatches < `0`) {
544	#ifndef NDEBUG
545	if (numMatches != PCRE_ERROR_NOMATCH)
546	fprintf(stderr, "KJS: pcre_exec() failed with result %d\n", numMatches);
547	#endif
548	if (offsetVector != fixedSizeOffsetVector)
549	delete [] offsetVector;
550	if (numMatches == PCRE_ERROR_MATCHLIMIT \|\| numMatches == PCRE_ERROR_RECURSIONLIMIT)
551	error = true*;
552	return UString::null();
553	}
554
555	*pos = offsetVector[`0`];
556	if (ovector)
557	*ovector = offsetVector;
558	return s.substr(offsetVector[`0`], offsetVector[`1`] - offsetVector[`0`]);
559
560	#else
561
562	if (!_valid)
563	return UString::null();
564
565	const unsigned maxMatch = `10`;
566	regmatch_t rmatch[maxMatch];
567
568	char str = strdup(s.ascii()); // TODO: why ???*
569	if (regexec(&_regex, str + i, maxMatch, rmatch, `0`)) {
570	free(str);
571	return UString::null();
572	}
573	free(str);
574
575	if (!ovector) {
576	*pos = rmatch[`0`].rm_so + i;
577	return s.substr(rmatch[`0`].rm_so + i, rmatch[`0`].rm_eo - rmatch[`0`].rm_so);
578	}
579
580	// map rmatch array to ovector used in PCRE case
581	_numSubPatterns = `0`;
582	for(unsigned j = `1`; j < maxMatch && rmatch[j].rm_so >= `0` ; j++)
583	_numSubPatterns++;
584	int ovecsize = (_numSubPatterns+`1`)`3`; // see above*
585	ovector = new* int[ovecsize];
586	for (unsigned j = `0`; j < _numSubPatterns + `1`; j++) {
587	if (j>maxMatch)
588	break;
589	(ovector)[`2`j] = rmatch[j].rm_so + i;
590	(ovector)[`2`j+`1`] = rmatch[j].rm_eo + i;
591	}
592
593	pos = (ovector)[`0`];
594	return s.substr((ovector)[`0`], (ovector)[`1`] - (*ovector)[`0`]);
595
596	#endif
597	}
598
599	} // namespace KJS
600