lexer.cpp [kdelibs/kjs/lexer.cpp]

// -*- c-basic-offset: 2 -*-
2	/*
3	* This file is part of the KDE libraries
4	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5	* Copyright (C) 2006 Apple Computer, Inc.
6	* Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
7	*
8	* This library is free software; you can redistribute it and/or
9	* modify it under the terms of the GNU Library General Public
10	* License as published by the Free Software Foundation; either
11	* version 2 of the License, or (at your option) any later version.
12	*
13	* This library is distributed in the hope that it will be useful,
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16	* Library General Public License for more details.
17	*
18	* You should have received a copy of the GNU Library General Public License
19	* along with this library; see the file COPYING.LIB. If not, write to
20	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21	* Boston, MA 02110-1301, USA.
22	*
23	*/
24
25	#include "lexer.h"
26	#include <config-kjs.h>
27	#include <string.h>
28	#include <limits.h>
29
30	#include "dtoa.h"
31	#include "function.h"
32	#include "interpreter.h"
33	#include "nodes.h"
34	#include "commonunicode.h"
35	#include "wtf/ASCIICType.h"
36	#include "wtf/DisallowCType.h"
37	#include <wtf/unicode/libc/UnicodeLibC.h>
38
39	using namespace WTF;
40	using namespace Unicode;
41
42	// GCC cstring uses these automatically, but not all implementations do.
43	using std::strlen;
44	using std::strcpy;
45	using std::strncpy;
46	using std::memset;
47	using std::memcpy;
48
49	// we can't specify the namespace in yacc's C output, so do it here
50	using namespace KJS;
51
52	#ifndef KDE_USE_FINAL
53	#include "grammar.h"
54	#endif
55
56	#include "lookup.h"
57	#include "lexer.lut.h"
58
59	extern YYLTYPE kjsyylloc; // global bison variable holding token info
60
61	// a bridge for yacc from the C world to C++
62	int kjsyylex()
63	{
64	return lexer().lex();
65	}
66
67	namespace KJS {
68
// Forward declaration; the definition appears further down in this file.
static bool isDecimalDigit(int c);

// Capacity reserved up front for the two read buffers (m_buffer8, m_buffer16).
static const size_t initialReadBufferCapacity = 32;
// Capacity reserved up front for the string and identifier tables
// (m_strings, m_identifiers).
static const size_t initialStringTableCapacity = 64;
73
74	Lexer& lexer()
75	{
76	// ASSERT(JSLock::currentThreadIsHoldingLock());
77
78	// FIXME: We'd like to avoid calling new here, but we don't currently
79	// support tearing down the Lexer at app quit time, since that would involve
80	// tearing down its UString data members without holding the JSLock.
81	static Lexer* staticLexer = new Lexer;
82	return *staticLexer;
83	}
84
85	Lexer::Lexer()
86	: yylineno(`0`)
87	, restrKeyword(false)
88	, eatNextIdentifier(false)
89	, stackToken(-`1`)
90	, lastToken(-`1`)
91	, pos(`0`)
92	, code(`0`)
93	, length(`0`)
94	#ifndef KJS_PURE_ECMA
95	, bol(true)
96	#endif
97	, current(`0`)
98	, next1(`0`)
99	, next2(`0`)
100	, next3(`0`)
101	{
102	m_buffer8.reserveCapacity(initialReadBufferCapacity);
103	m_buffer16.reserveCapacity(initialReadBufferCapacity);
104	m_strings.reserveCapacity(initialStringTableCapacity);
105	m_identifiers.reserveCapacity(initialStringTableCapacity);
106	}
107
108	void Lexer::setCode(const UString &sourceURL, int startingLineNumber, const KJS::UChar c, unsigned* int len)
109	{
110	yylineno = startingLineNumber;
111	m_sourceURL = sourceURL;
112	restrKeyword = false;
113	delimited = false;
114	eatNextIdentifier = false;
115	stackToken = -`1`;
116	lastToken = -`1`;
117	pos = `0`;
118	code = c;
119	length = len;
120	skipLF = false;
121	skipCR = false;
122	error = false;
123	#ifndef KJS_PURE_ECMA
124	bol = true;
125	#endif
126
127	// read first characters
128	current = (length > `0`) ? code[`0`].uc : -`1`;
129	next1 = (length > `1`) ? code[`1`].uc : -`1`;
130	next2 = (length > `2`) ? code[`2`].uc : -`1`;
131	next3 = (length > `3`) ? code[`3`].uc : -`1`;
132	}
133
134	void Lexer::shift(unsigned int p)
135	{
136	// Here would be a good place to strip Cf characters, but that has caused compatibility problems:
137	// <http://bugs.webkit.org/show_bug.cgi?id=10183>.
138	while (p--) {
139	current = next1;
140	next1 = next2;
141	next2 = next3;
142	pos++;
143	next3 = (pos + `3` < length) ? code[pos + `3`].uc : -`1`;
144	}
145	}
146
147	// called on each new line
148	void Lexer::nextLine()
149	{
150	yylineno++;
151	#ifndef KJS_PURE_ECMA
152	bol = true;
153	#endif
154	}
155
156	void Lexer::setDone(State s)
157	{
158	state = s;
159	done = true;
160	}
161
162	int Lexer::lex()
163	{
164	int token = `0`;
165	state = Start;
166	unsigned short stringType = `0`; // either single or double quotes
167	m_buffer8.clear();
168	m_buffer16.clear();
169	done = false;
170	terminator = false;
171	skipLF = false;
172	skipCR = false;
173
174	// did we push a token on the stack previously ?
175	// (after an automatic semicolon insertion)
176	if (stackToken >= `0`) {
177	setDone(Other);
178	token = stackToken;
179	stackToken = `0`;
180	}
181
182	while (!done) {
183	if (skipLF && current != '\n') // found \r but not \n afterwards
184	skipLF = false;
185	if (skipCR && current != '\r') // found \n but not \r afterwards
186	skipCR = false;
187	if (skipLF \|\| skipCR) // found \r\n or \n\r -> eat the second one
188	{
189	skipLF = false;
190	skipCR = false;
191	shift(`1`);
192	}
193	switch (state) {
194	case Start:
195	if (isWhiteSpace()) {
196	// do nothing
197	} else if (current == '/' && next1 == '/') {
198	shift(`1`);
199	state = InSingleLineComment;
200	} else if (current == '/' && next1 == '*') {
201	shift(`1`);
202	state = InMultiLineComment;
203	} else if (current == -`1`) {
204	if (!terminator && !delimited) {
205	// automatic semicolon insertion if program incomplete
206	token = ';';
207	stackToken = `0`;
208	setDone(Other);
209	} else
210	setDone(Eof);
211	} else if (isLineTerminator()) {
212	nextLine();
213	terminator = true;
214	if (restrKeyword) {
215	token = ';';
216	setDone(Other);
217	}
218	} else if (current == '"' \|\| current == '\'') {
219	state = InString;
220	stringType = static_cast<unsigned short>(current);
221	} else if (isIdentStart(current)) {
222	record16(current);
223	state = InIdentifierOrKeyword;
224	} else if (current == '\\') {
225	state = InIdentifierStartUnicodeEscapeStart;
226	} else if (current == '0') {
227	record8(current);
228	state = InNum0;
229	} else if (isDecimalDigit(current)) {
230	record8(current);
231	state = InNum;
232	} else if (current == '.' && isDecimalDigit(next1)) {
233	record8(current);
234	state = InDecimal;
235	#ifndef KJS_PURE_ECMA
236	// <!-- marks the beginning of a line comment (for www usage)
237	} else if (current == '<' && next1 == '!' &&
238	next2 == '-' && next3 == '-') {
239	shift(`3`);
240	state = InSingleLineComment;
241	// same for -->
242	} else if (bol && current == '-' && next1 == '-' && next2 == '>') {
243	shift(`2`);
244	state = InSingleLineComment;
245	#endif
246	} else {
247	token = matchPunctuator(current, next1, next2, next3);
248	if (token != -`1`) {
249	setDone(Other);
250	} else {
251	// cerr << "encountered unknown character" << endl;
252	setDone(Bad);
253	}
254	}
255	break;
256	case InString:
257	switch (current) {
258	case '\'':
259	case '"':
260	if (current == stringType) {
261	shift(`1`);
262	setDone(String);
263	} else {
264	record16(current);
265	}
266	break;
267	case '\\':
268	state = InEscapeSequence;
269	break;
270	case '\n':
271	case '\r':
272	case `0x2028`:
273	case `0x2029`:
274	case -`1`:
275	// encountered newline or eof
276	setDone(Bad);
277	break;
278	default:
279	record16(current);
280	break;
281	}
282	break;
283	// Escape Sequences inside of strings
284	case InEscapeSequence:
285	if (isOctalDigit(current)) {
286	if (current >= '0' && current <= '3' &&
287	isOctalDigit(next1) && isOctalDigit(next2)) {
288	record16(convertOctal(current, next1, next2));
289	shift(`2`);
290	state = InString;
291	} else if (isOctalDigit(current) && isOctalDigit(next1)) {
292	record16(convertOctal('0', current, next1));
293	shift(`1`);
294	state = InString;
295	} else if (isOctalDigit(current)) {
296	record16(convertOctal('0', '0', current));
297	state = InString;
298	} else {
299	setDone(Bad);
300	}
301	} else if (current == 'x')
302	state = InHexEscape;
303	else if (current == 'u')
304	state = InUnicodeEscape;
305	else if (isLineTerminator()) {
306	nextLine();
307	state = InString;
308	} else {
309	record16(singleEscape(static_cast<unsigned short>(current)));
310	state = InString;
311	}
312	break;
313	case InHexEscape:
314	if (isHexDigit(current) && isHexDigit(next1)) {
315	state = InString;
316	record16(convertHex(current, next1));
317	shift(`1`);
318	} else {
319	setDone(Bad);
320	}
321	break;
322	case InUnicodeEscape:
323	if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
324	record16(convertUnicode(current, next1, next2, next3));
325	shift(`3`);
326	state = InString;
327	} else if (current == stringType) {
328	record16('u');
329	shift(`1`);
330	setDone(String);
331	} else {
332	setDone(Bad);
333	}
334	break;
335	case InSingleLineComment:
336	if (isLineTerminator()) {
337	nextLine();
338	terminator = true;
339	if (restrKeyword) {
340	token = ';';
341	setDone(Other);
342	} else
343	state = Start;
344	} else if (current == -`1`) {
345	setDone(Eof);
346	}
347	break;
348	case InMultiLineComment:
349	if (current == -`1`) {
350	setDone(Bad);
351	} else if (isLineTerminator()) {
352	nextLine();
353	} else if (current == '*' && next1 == '/') {
354	state = Start;
355	shift(`1`);
356	}
357	break;
358	case InIdentifierOrKeyword:
359	case InIdentifier:
360	if (isIdentPart(current))
361	record16(current);
362	else if (current == '\\')
363	state = InIdentifierPartUnicodeEscapeStart;
364	else
365	setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
366	break;
367	case InNum0:
368	if (current == 'x' \|\| current == 'X') {
369	record8(current);
370	state = InHex;
371	} else if (current == '.') {
372	record8(current);
373	state = InDecimal;
374	} else if (current == 'e' \|\| current == 'E') {
375	record8(current);
376	state = InExponentIndicator;
377	} else if (isOctalDigit(current)) {
378	record8(current);
379	state = InOctal;
380	} else if (isDecimalDigit(current)) {
381	record8(current);
382	state = InDecimal;
383	} else {
384	setDone(Number);
385	}
386	break;
387	case InHex:
388	if (isHexDigit(current)) {
389	record8(current);
390	} else {
391	setDone(Hex);
392	}
393	break;
394	case InOctal:
395	if (isOctalDigit(current)) {
396	record8(current);
397	}
398	else if (isDecimalDigit(current)) {
399	record8(current);
400	state = InDecimal;
401	} else
402	setDone(Octal);
403	break;
404	case InNum:
405	if (isDecimalDigit(current)) {
406	record8(current);
407	} else if (current == '.') {
408	record8(current);
409	state = InDecimal;
410	} else if (current == 'e' \|\| current == 'E') {
411	record8(current);
412	state = InExponentIndicator;
413	} else
414	setDone(Number);
415	break;
416	case InDecimal:
417	if (isDecimalDigit(current)) {
418	record8(current);
419	} else if (current == 'e' \|\| current == 'E') {
420	record8(current);
421	state = InExponentIndicator;
422	} else
423	setDone(Number);
424	break;
425	case InExponentIndicator:
426	if (current == '+' \|\| current == '-') {
427	record8(current);
428	} else if (isDecimalDigit(current)) {
429	record8(current);
430	state = InExponent;
431	} else
432	setDone(Bad);
433	break;
434	case InExponent:
435	if (isDecimalDigit(current)) {
436	record8(current);
437	} else
438	setDone(Number);
439	break;
440	case InIdentifierStartUnicodeEscapeStart:
441	if (current == 'u')
442	state = InIdentifierStartUnicodeEscape;
443	else
444	setDone(Bad);
445	break;
446	case InIdentifierPartUnicodeEscapeStart:
447	if (current == 'u')
448	state = InIdentifierPartUnicodeEscape;
449	else
450	setDone(Bad);
451	break;
452	case InIdentifierStartUnicodeEscape:
453	if (!isHexDigit(current) \|\| !isHexDigit(next1) \|\| !isHexDigit(next2) \|\| !isHexDigit(next3)) {
454	setDone(Bad);
455	break;
456	}
457	token = convertUnicode(current, next1, next2, next3).uc;
458	shift(`3`);
459	if (!isIdentStart(token)) {
460	setDone(Bad);
461	break;
462	}
463	record16(token);
464	state = InIdentifier;
465	break;
466	case InIdentifierPartUnicodeEscape:
467	if (!isHexDigit(current) \|\| !isHexDigit(next1) \|\| !isHexDigit(next2) \|\| !isHexDigit(next3)) {
468	setDone(Bad);
469	break;
470	}
471	token = convertUnicode(current, next1, next2, next3).uc;
472	shift(`3`);
473	if (!isIdentPart(token)) {
474	setDone(Bad);
475	break;
476	}
477	record16(token);
478	state = InIdentifier;
479	break;
480	default:
481	assert(!"Unhandled state in switch statement");
482	}
483
484	// move on to the next character
485	if (!done)
486	shift(`1`);
487	#ifndef KJS_PURE_ECMA
488	if (state != Start && state != InMultiLineComment)
489	bol = false;
490	#endif
491	}
492
493	// no identifiers allowed directly after numeric literal, e.g. "3in" is bad
494	if ((state == Number \|\| state == Octal \|\| state == Hex) && isIdentStart(current))
495	state = Bad;
496
497	// terminate string
498	m_buffer8.append('\0');
499
500	#ifdef KJS_DEBUG_LEX
501	fprintf(stderr, "line: %d ", lineNo());
502	fprintf(stderr, "yytext (%x): ", m_buffer8[`0`]);
503	fprintf(stderr, "%s ", m_buffer8.data());
504	#endif
505
506	double dval = `0`;
507	if (state == Number) {
508	dval = kjs_strtod(m_buffer8.data(), `0L`);
509	} else if (state == Hex) { // scan hex numbers
510	const char *p = m_buffer8.data() + `2`;
511	while (char c = *p++) {
512	dval *= `16`;
513	dval += convertHex(c);
514	}
515
516	if (dval >= mantissaOverflowLowerBound)
517	dval = parseIntOverflow(m_buffer8.data() + `2`, p - (m_buffer8.data() + `3`), `16`);
518
519	state = Number;
520	} else if (state == Octal) { // scan octal number
521	const char *p = m_buffer8.data() + `1`;
522	while (char c = *p++) {
523	dval *= `8`;
524	dval += c - '0';
525	}
526
527	if (dval >= mantissaOverflowLowerBound)
528	dval = parseIntOverflow(m_buffer8.data() + `1`, p - (m_buffer8.data() + `2`), `8`);
529
530	state = Number;
531	}
532
533	#ifdef KJS_DEBUG_LEX
534	switch (state) {
535	case Eof:
536	printf("(EOF)\n");
537	break;
538	case Other:
539	printf("(Other)\n");
540	break;
541	case Identifier:
542	printf("(Identifier)/(Keyword)\n");
543	break;
544	case String:
545	printf("(String)\n");
546	break;
547	case Number:
548	printf("(Number)\n");
549	break;
550	default:
551	printf("(unknown)");
552	}
553	#endif
554
555	if (state != Identifier && eatNextIdentifier)
556	eatNextIdentifier = false;
557
558	restrKeyword = false;
559	delimited = false;
560	kjsyylloc.first_line = yylineno; // ???
561	kjsyylloc.last_line = yylineno;
562
563	switch (state) {
564	case Eof:
565	token = `0`;
566	break;
567	case Other:
568	if(token == '}' \|\| token == ';') {
569	delimited = true;
570	}
571	break;
572	case IdentifierOrKeyword:
573	if ((token = Lookup::find(&mainTable, m_buffer16.data(), m_buffer16.size())) < `0`) {
574	case Identifier:
575	// Lookup for keyword failed, means this is an identifier
576	// Apply anonymous-function hack below (eat the identifier)
577	if (eatNextIdentifier) {
578	eatNextIdentifier = false;
579	token = lex();
580	break;
581	}
582	kjsyylval.ident = makeIdentifier(m_buffer16);
583	token = IDENT;
584	break;
585	}
586
587	eatNextIdentifier = false;
588	// Hack for "f = function somename() { ... }", too hard to get into the grammar
589	if (token == FUNCTION && lastToken == '=' )
590	eatNextIdentifier = true;
591
592	if (token == CONTINUE \|\| token == BREAK \|\|
593	token == RETURN \|\| token == THROW)
594	restrKeyword = true;
595	break;
596	case String:
597	kjsyylval.ustr = makeUString(m_buffer16);
598	token = STRING;
599	break;
600	case Number:
601	kjsyylval.dval = dval;
602	token = NUMBER;
603	break;
604	case Bad:
605	#ifdef KJS_DEBUG_LEX
606	fprintf(stderr, "KJS: yylex: ERROR.\n");
607	#endif
608	error = true;
609	return -`1`;
610	default:
611	assert(!"unhandled numeration value in switch");
612	error = true;
613	return -`1`;
614	}
615	lastToken = token;
616	return token;
617	}
618
619	bool Lexer::isWhiteSpace() const
620	{
621	return CommonUnicode::isWhiteSpace(current);
622	}
623
624	bool Lexer::isLineTerminator()
625	{
626	bool cr = (current == '\r');
627	bool lf = (current == '\n');
628	if (cr)
629	skipLF = true;
630	else if (lf)
631	skipCR = true;
632	return cr \|\| lf \|\| current == `0x2028` \|\| current == `0x2029`;
633	}
634
// Function type of a character-classification predicate (takes a character
// code, answers yes/no). Used below through pointers to it.
typedef bool CharacterCheck(int);
636
637	static bool isIdentStartLibC(int c)
638	{
639	return (category(c) & (Letter_Uppercase \| Letter_Lowercase \|
640	Letter_Titlecase \| Letter_Modifier \| Letter_Other))
641	\|\| c == '$' \|\| c == '_';
642	}
643
644	static bool isIdentPartLibC(int c)
645	{
646	return (category(c) & (Letter_Uppercase \| Letter_Lowercase \|
647	Letter_Titlecase \| Letter_Modifier \| Letter_Other \|
648	Mark_NonSpacing \| Mark_SpacingCombining \|
649	Number_DecimalDigit \| Punctuation_Connector))
650	\|\| c == '$' \|\| c == '_';
651	}
652
653	static CharacterCheck *identStart = ::isIdentStartLibC;
654	static CharacterCheck *identPart = ::isIdentPartLibC;
655
656	void Lexer::setIdentStartChecker(bool (f)(int* c))
657	{
658	identStart = f;
659	}
660
661	void Lexer::setIdentPartChecker(bool (f)(int* c))
662	{
663	identPart = f;
664	}
665
666	bool Lexer::isIdentStart(int c)
667	{
668	return (*identStart)(c);
669	}
670
671	bool Lexer::isIdentPart(int c)
672	{
673	return (*identPart)(c);
674	}
675
// True for the ASCII digits '0' through '9'; false for everything else,
// including the -1 end-of-input marker.
static bool isDecimalDigit(int c)
{
  if (c < '0')
    return false;
  return c <= '9';
}
680
681	bool Lexer::isHexDigit(int c)
682	{
683	return ((c >= '0' && c <= '9') \|\|
684	(c >= 'a' && c <= 'f') \|\|
685	(c >= 'A' && c <= 'F'));
686	}
687
688	bool Lexer::isOctalDigit(int c)
689	{
690	return (c >= '0' && c <= '7');
691	}
692
693	int Lexer::matchPunctuator(int c1, int c2, int c3, int c4)
694	{
695	if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
696	shift(`4`);
697	return URSHIFTEQUAL;
698	} else if (c1 == '=' && c2 == '=' && c3 == '=') {
699	shift(`3`);
700	return STREQ;
701	} else if (c1 == '!' && c2 == '=' && c3 == '=') {
702	shift(`3`);
703	return STRNEQ;
704	} else if (c1 == '>' && c2 == '>' && c3 == '>') {
705	shift(`3`);
706	return URSHIFT;
707	} else if (c1 == '<' && c2 == '<' && c3 == '=') {
708	shift(`3`);
709	return LSHIFTEQUAL;
710	} else if (c1 == '>' && c2 == '>' && c3 == '=') {
711	shift(`3`);
712	return RSHIFTEQUAL;
713	} else if (c1 == '<' && c2 == '=') {
714	shift(`2`);
715	return LE;
716	} else if (c1 == '>' && c2 == '=') {
717	shift(`2`);
718	return GE;
719	} else if (c1 == '!' && c2 == '=') {
720	shift(`2`);
721	return NE;
722	} else if (c1 == '+' && c2 == '+') {
723	shift(`2`);
724	if (terminator)
725	return AUTOPLUSPLUS;
726	else
727	return PLUSPLUS;
728	} else if (c1 == '-' && c2 == '-') {
729	shift(`2`);
730	if (terminator)
731	return AUTOMINUSMINUS;
732	else
733	return MINUSMINUS;
734	} else if (c1 == '=' && c2 == '=') {
735	shift(`2`);
736	return EQEQ;
737	} else if (c1 == '+' && c2 == '=') {
738	shift(`2`);
739	return PLUSEQUAL;
740	} else if (c1 == '-' && c2 == '=') {
741	shift(`2`);
742	return MINUSEQUAL;
743	} else if (c1 == '*' && c2 == '=') {
744	shift(`2`);
745	return MULTEQUAL;
746	} else if (c1 == '/' && c2 == '=') {
747	shift(`2`);
748	return DIVEQUAL;
749	} else if (c1 == '&' && c2 == '=') {
750	shift(`2`);
751	return ANDEQUAL;
752	} else if (c1 == '^' && c2 == '=') {
753	shift(`2`);
754	return XOREQUAL;
755	} else if (c1 == '%' && c2 == '=') {
756	shift(`2`);
757	return MODEQUAL;
758	} else if (c1 == '\|' && c2 == '=') {
759	shift(`2`);
760	return OREQUAL;
761	} else if (c1 == '<' && c2 == '<') {
762	shift(`2`);
763	return LSHIFT;
764	} else if (c1 == '>' && c2 == '>') {
765	shift(`2`);
766	return RSHIFT;
767	} else if (c1 == '&' && c2 == '&') {
768	shift(`2`);
769	return AND;
770	} else if (c1 == '\|' && c2 == '\|') {
771	shift(`2`);
772	return OR;
773	}
774
775	switch(c1) {
776	case '=':
777	case '>':
778	case '<':
779	case ',':
780	case '!':
781	case '~':
782	case '?':
783	case ':':
784	case '.':
785	case '+':
786	case '-':
787	case '*':
788	case '/':
789	case '&':
790	case '\|':
791	case '^':
792	case '%':
793	case '(':
794	case ')':
795	case '{':
796	case '}':
797	case '[':
798	case ']':
799	case ';':
800	shift(`1`);
801	return static_cast<int>(c1);
802	default:
803	return -`1`;
804	}
805	}
806
807	unsigned short Lexer::singleEscape(unsigned short c)
808	{
809	switch(c) {
810	case 'b':
811	return `0x08`;
812	case 't':
813	return `0x09`;
814	case 'n':
815	return `0x0A`;
816	case 'v':
817	return `0x0B`;
818	case 'f':
819	return `0x0C`;
820	case 'r':
821	return `0x0D`;
822	case '"':
823	return `0x22`;
824	case '\'':
825	return `0x27`;
826	case '\\':
827	return `0x5C`;
828	default:
829	return c;
830	}
831	}
832
833	unsigned short Lexer::convertOctal(int c1, int c2, int c3)
834	{
835	return static_cast<unsigned short>((c1 - '0') * `64` + (c2 - '0') * `8` + c3 - '0');
836	}
837
838	unsigned char Lexer::convertHex(int c)
839	{
840	if (c >= '0' && c <= '9')
841	return static_cast<unsigned char>(c - '0');
842	if (c >= 'a' && c <= 'f')
843	return static_cast<unsigned char>(c - 'a' + `10`);
844	return static_cast<unsigned char>(c - 'A' + `10`);
845	}
846
847	unsigned char Lexer::convertHex(int c1, int c2)
848	{
849	return ((convertHex(c1) << `4`) + convertHex(c2));
850	}
851
852	KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
853	{
854	return KJS::UChar ((convertHex(c1) << `4`) + convertHex(c2),
855	(convertHex(c3) << `4`) + convertHex(c4));
856	}
857
858	void Lexer::record8(int c)
859	{
860	ASSERT(c >= `0`);
861	ASSERT(c <= `0xff`);
862	m_buffer8.append(c);
863	}
864
865	void Lexer::record16(int c)
866	{
867	ASSERT(c >= `0`);
868	ASSERT(c <= USHRT_MAX);
869	record16(UChar (static_cast<unsigned short>(c)));
870	}
871
872	void Lexer::record16(KJS::UChar c)
873	{
874	m_buffer16.append(c);
875	}
876
877	bool Lexer::scanRegExp()
878	{
879	m_buffer16.clear();
880	bool lastWasEscape = false;
881	bool inBrackets = false;
882
883	while (`1`) {
884	if (isLineTerminator() \|\| current == -`1`)
885	return false;
886	else if (current != '/' \|\| lastWasEscape == true \|\| inBrackets == true)
887	{
888	// keep track of '[' and ']'
889	if (!lastWasEscape) {
890	if ( current == '[' && !inBrackets )
891	inBrackets = true;
892	if ( current == ']' && inBrackets )
893	inBrackets = false;
894	}
895	record16(current);
896	lastWasEscape =
897	!lastWasEscape && (current == '\\');
898	} else { // end of regexp
899	m_pattern = UString(m_buffer16);
900	m_buffer16.clear();
901	shift(`1`);
902	break;
903	}
904	shift(`1`);
905	}
906
907	while (isIdentPart(current)) {
908	record16(current);
909	shift(`1`);
910	}
911	m_flags = UString(m_buffer16);
912
913	return true;
914	}
915
916
917	void Lexer::clear()
918	{
919	deleteAllValues(m_strings);
920	Vector<UString*> newStrings;
921	newStrings.reserveCapacity(initialStringTableCapacity);
922	m_strings.swap(newStrings);
923	deleteAllValues(m_identifiers);
924	Vector<KJS::Identifier*> newIdentifiers;
925	newIdentifiers.reserveCapacity(initialStringTableCapacity);
926	m_identifiers.swap(newIdentifiers);
927
928	Vector<char> newBuffer8;
929	newBuffer8.reserveCapacity(initialReadBufferCapacity);
930	m_buffer8.swap(newBuffer8);
931
932	Vector<UChar> newBuffer16;
933	newBuffer16.reserveCapacity(initialReadBufferCapacity);
934	m_buffer16.swap(newBuffer16);
935
936	m_pattern = `0`;
937	m_flags = `0`;
938	m_sourceURL = `0`;
939	}
940
941	Identifier* Lexer::makeIdentifier(const Vector<KJS::UChar>& buffer)
942	{
943	KJS::Identifier* identifier = new KJS::Identifier (buffer.data(), buffer.size());
944	m_identifiers.append(identifier);
945	return identifier;
946	}
947
948	UString* Lexer::makeUString(const Vector<KJS::UChar>& buffer)
949	{
950	UString* string = new UString (buffer);
951	m_strings.append(string);
952	return string;
953	}
954
955	} // namespace KJS
956