jsonlexer.cpp [kdelibs/kjs/jsonlexer.cpp]

1	/*
2	* This file is part of the KDE libraries
3	* Copyright (C) 2012 Bernd Buschinski (b.buschinski@googlemail.com)
4	*
5	* This library is free software; you can redistribute it and/or
6	* modify it under the terms of the GNU Library General Public
7	* License as published by the Free Software Foundation; either
8	* version 2 of the License, or (at your option) any later version.
9	*
10	* This library is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	* Library General Public License for more details.
14	*
15	* You should have received a copy of the GNU Library General Public License
16	* along with this library; see the file COPYING.LIB. If not, write to
17	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18	* Boston, MA 02110-1301, USA.
19	*
20	*/
21
22	#include "jsonlexer.h"
23
24	#include <stack>
25
26	#include "lexer.h"
27	#include "object.h"
28
29	#include "wtf/Assertions.h"
30
31	// #define JSONLEXER_DEBUG_VERBOSE
32
33	namespace KJS {
34
35	using namespace JSONParserState;
36
37	static const unsigned short InvalidJSONUnicode = `0x001F`;
38
39	static inline bool isDecimalDigit(const UChar &c)
40	{
41	return (c.uc >= '0' && c.uc <= '9');
42	}
43
44	static inline bool isHexDigit(const UChar& c)
45	{
46	return (isDecimalDigit(c) \|\|
47	(c.uc >= 'a' && c.uc <= 'f') \|\|
48	(c.uc >= 'A' && c.uc <= 'F'));
49	}
50
51	static inline bool isJSONWhiteSpace(const UChar& c)
52	{
53	//ECMA Edition 5.1r6 - 15.12.1.1 - Syntax
54	switch (c.uc) {
55	case `0x0020`: //SP
56	case `0x0009`: //TAB
57	case `0x000A`: //LF
58	case `0x000D`: //CR
59	return true;
60	default:
61	return false;
62	}
63	}
64
65	#ifdef JSONLEXER_DEBUG_VERBOSE
66	static inline UString tokenToString(TokenType type)
67	{
68	switch (type) {
69	case TokLBracket: return UString("TokLBracket");
70	case TokRBracket: return UString("TokRBracket");
71	case TokLBrace: return UString("TokLBrace");
72	case TokRBrace: return UString("TokRBrace");
73	case TokString: return UString("TokString");
74	case TokIdentifier: return UString("TokIdentifier");
75	case TokNumber: return UString("TokNumber");
76	case TokColon: return UString("TokColon");
77	case TokLParen: return UString("TokLParen");
78	case TokRParen: return UString("TokRParen");
79	case TokComma: return UString("TokComma");
80	case TokTrue: return UString("TokTrue");
81	case TokFalse: return UString("TokFalse");
82	case TokNull: return UString("TokNull");
83	case TokEnd: return UString("TokEnd");
84	case TokError: return UString("TokError");
85	}
86	ASSERT_NOT_REACHED();
87	return UString("Default");
88	}
89
90	static inline UString parserStateToString(ParserState state)
91	{
92	switch (state) {
93	case JSONValue: return UString("JSONValue");
94	case JSONObject: return UString("JSONObject");
95	case JSONArray: return UString("JSONArray");
96	}
97	ASSERT_NOT_REACHED();
98	return UString("Default");
99	}
100	#endif
101
102
103	// ------------------------------ JSONParser --------------------------------
104
105	JSONParser::JSONParser(const UString& code)
106	: m_lexer (code)
107	{
108	#ifdef JSONLEXER_DEBUG_VERBOSE
109	fprintf(stderr, "=============== new JSONParser ===============\n%s\n===============\n", code.ascii());
110	#endif
111	}
112
113	JSValue* JSONParser::tryParse(ExecState* exec)
114	{
115	JSValue* ret = parse(exec);
116	// If the syntax is correct, we never see the EOF, the last used token may be '}'.
117	// But Syntax like "{} xyz" is also invalid, so we have to check if the next(last) token is EOF
118	if (ret && nextParseIsEOF())
119	return ret;
120	return `0`;
121	}
122
123	// helper function for adding a value to the object.
124	// the arrayStack saves all added values and gives the correct array position.
125	// This function will return false for NULL value or on exception.
126	static inline bool addArrayItem(ExecState* exec, std::stack<JSValue> arrayStack, JSValue* value, JSObject* object)
127	{
128	if (exec->hadException())
129	return false;
130
131	if (!value)
132	return false;
133
134	arrayStack->push(value);
135	object->put(exec, arrayStack->size()-`1`, value);
136	return true;
137	}
138
139	JSValue* JSONParser::parse(ExecState* exec, ParserState state)
140	{
141	if (exec->hadException())
142	return `0`;
143
144	ParserState tState = state;
145	TokenType type = m_lexer.next();
146
147	JSObject* object = `0`;
148	std::stack<JSValue*> arrayObjectStack;
149	UString propertyName;
150
151	// For parsing the Object, did we found a propertyName?
152	// NOTE: empty propertynames are allowed.
153	bool havePropertyName = false;
154	// For parsing the Object/Array, checks if we really added/found a propertyName
155	// before we find the comma ','
156	bool propAdded = false;
157	// For parsing the Array, remember if last found token is Comma
158	bool lastFoundIsTokComma = false;
159
160	while (type != TokEnd && type != TokError) {
161	#ifdef JSONLEXER_DEBUG_VERBOSE
162	fprintf(stderr, "TokenType: %s \t State: %s\n", tokenToString(type).ascii(), parserStateToString(tState).ascii());
163	#endif
164
165	switch (tState) {
166	case JSONValue:
167	switch (type) {
168	case TokLBracket:
169	object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
170	tState = JSONArray;
171	break;
172	case TokLBrace:
173	object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
174	tState = JSONObject;
175	break;
176	case TokString:
177	return jsString(m_lexer.currentString());
178	case TokNull:
179	return jsNull();
180	case TokTrue:
181	return jsBoolean(true);
182	case TokFalse:
183	return jsBoolean(false);
184	case TokNumber:
185	return jsNumber(m_lexer.currentNumber());
186	default:
187	// This can only happen on invalid syntax and with 0 return
188	// we tell the caller we got a syntax error.
189
190	// ASSERT_NOT_REACHED();
191	return `0`;
192	}
193	break;
194	case JSONObject: {
195	// if we got called from JSONArray-TokLBrace we did not create an object.
196
197	// In more detail for the following JSON String "[{}]"
198	// If we are in parse with type=JSONArray and state=TokLBrace,
199	// means we just found the "{" in the Array, and call parse(exec, JSONObject),
200	// now in this call type=JSONObject, state=TokRBrace ("}") and, our new, object=0 (!)
201	// We will finish the object and return it, but as object is null, we return 0.
202	// which would be wrong, as empty objects are allowed.
203	// In this case we just report invalid data.
204	// But for JSON String like "[{"a":1}]", we end up using object(0)->putDirect
205	// and crash.
206
207	// In short, remove this line and we will crash.
208	object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
209	switch (type) {
210	case TokString: // PropertyName
211	if (havePropertyName)
212	return `0`;
213	propertyName = m_lexer.currentString();
214	havePropertyName = true;
215	break;
216	case TokColon: {
217	if (!havePropertyName)
218	return `0`;
219	JSValue* val = parse(exec, JSONValue);
220	if (!val)
221	return `0`;
222
223	// use putDirect to by-pass __proto__
224	object->putDirect(Identifier(propertyName), val);
225	propertyName = "";
226	havePropertyName = false;
227	propAdded = true;
228	break;
229	}
230	case TokRBrace: //Finish Object
231	if (havePropertyName)
232	return `0`;
233	return object;
234	case TokComma: // Next Property
235	if (!propAdded)
236	return `0`;
237	propAdded = false;
238	break;
239	default:
240	// This can only happen on invalid syntax and with 0 return
241	// we tell the caller we got a syntax error.
242
243	// ASSERT_NOT_REACHED();
244	return `0`;
245	}
246	break;
247	}
248	case JSONArray: {
249	// if we got called from JSONArray-TokLBracket we did not create an object
250	object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
251
252	// Check for invalid Array syntax, like ["1" "2"]
253	switch (type) {
254	case TokNumber:
255	case TokString:
256	case TokNull:
257	case TokTrue:
258	case TokFalse:
259	case TokLBrace:
260	case TokLBracket:
261	if (propAdded)
262	return `0`;
263	propAdded = true;
264	lastFoundIsTokComma = false;
265	break;
266	default:
267	break;
268	}
269
270	switch (type) {
271	case TokRBracket: // Finish array
272	// Check for invalid syntax like "[1,]"
273	if (lastFoundIsTokComma)
274	return `0`;
275	return object;
276	case TokNumber:
277	if (!addArrayItem(exec, &arrayObjectStack, jsNumber(m_lexer.currentNumber()), object))
278	return `0`;
279	break;
280	case TokString:
281	if (!addArrayItem(exec, &arrayObjectStack, jsString(m_lexer.currentString()), object))
282	return `0`;
283	break;
284	case TokNull:
285	if (!addArrayItem(exec, &arrayObjectStack, jsNull(), object))
286	return `0`;
287	break;
288	case TokTrue:
289	if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(true), object))
290	return `0`;
291	break;
292	case TokFalse:
293	if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(false), object))
294	return `0`;
295	break;
296	case TokLBrace:
297	if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONObject), object))
298	return `0`;
299	break;
300	case TokLBracket:
301	if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONArray), object))
302	return `0`;
303	break;
304	case TokComma: // Skip Comma and parse next Array Element
305	// if we found a comma without a leading property, this is invalid syntax
306	if (!propAdded)
307	return `0`;
308	propAdded = false;
309	lastFoundIsTokComma = true;
310	break;
311	default:
312	// This can only happen on invalid syntax and with 0 return
313	// we tell the caller we got a syntax error.
314
315	// ASSERT_NOT_REACHED();
316	return `0`;
317	}
318	break;
319	}
320	default:
321	ASSERT_NOT_REACHED();
322	return `0`;
323	}
324	type = m_lexer.next();
325	}
326
327	if (type == TokError) {
328	#ifdef JSONLEXER_DEBUG_VERBOSE
329	fprintf(stderr, "WARNING: JSONParse ending with error!\n");
330	#endif
331	return `0`;
332	}
333	if (type == TokEnd) {
334	#ifdef JSONLEXER_DEBUG_VERBOSE
335	fprintf(stderr, "WARNING: JSONParse ending with unexpected END!\n");
336	#endif
337	return `0`;
338	}
339	ASSERT_NOT_REACHED();
340	return `0`;
341	}
342
343
344	// ------------------------------ JSONLexer --------------------------------
345
346	JSONLexer::JSONLexer(const UString& code)
347	: m_code (code),
348	m_pos(`0`)
349	{
350	}
351
352	TokenType JSONLexer::current()
353	{
354	return m_type;
355	}
356
357	double JSONLexer::currentNumber() const
358	{
359	ASSERT(m_type == TokNumber);
360	return m_numberToken;
361	}
362
363	UString JSONLexer::currentString() const
364	{
365	ASSERT(m_type == TokString);
366	return m_stringToken;
367	}
368
369	TokenType JSONLexer::lexString()
370	{
371	UString string;
372	const int codeSize = m_code.size();
373	//skip first detected '"'
374	++m_pos;
375
376	if (m_pos >= codeSize) {
377	m_type = TokError;
378	return m_type;
379	}
380
381	//while not at the end of the string '"'
382	while (!(m_code [m_pos] == '"')) {
383	UChar cur = m_code [m_pos];
384	if (cur == UChar ('\\')) {
385	++m_pos;
386	bool error = false;
387	string.append(parseEscapeChar(&error));
388	if (error) {
389	m_type = TokError;
390	return m_type;
391	}
392	} else {
393	if (cur.uc <= InvalidJSONUnicode) {
394	m_type = TokError;
395	return m_type;
396	}
397	string.append(cur);
398	++m_pos;
399	}
400
401	if (m_pos >= codeSize) {
402	m_type = TokError;
403	return m_type;
404	}
405	}
406
407	m_type = TokString;
408	m_stringToken = string;
409	++m_pos;
410	#ifdef JSONLEXER_DEBUG_VERBOSE
411	fprintf(stderr, "JSONLexer::lexString: Pos:%d stringlength:%d string:%s\n", m_pos, string.size(), string.ascii());
412	#endif
413	return m_type;
414	}
415
416	TokenType JSONLexer::lexNumber()
417	{
418	const int start = m_pos;
419	const int codeSize = m_code.size();
420
421	// -?(0 \| [1-9][0-9]) ('.' [0-9]+)? ([eE][+-]? [0-9]+)?*
422
423	// -?
424	if (m_pos < codeSize && m_code [m_pos] == '-')
425	++m_pos;
426
427	// (0 \| [1-9][0-9])*
428	if (m_pos < codeSize && m_code [m_pos] == '0') {
429	++m_pos;
430	} else if (m_pos < codeSize) {
431	while (m_pos < codeSize && isDecimalDigit(m_code [m_pos])) {
432	++m_pos;
433	}
434	} else {
435	m_type = TokError;
436	return m_type;
437	}
438
439	// ('.' [0-9]+)?
440	if (m_pos < codeSize && m_code [m_pos] == '.') {
441	++m_pos;
442	// [0-9]+
443	if (m_pos >= codeSize \|\| !isDecimalDigit(m_code [m_pos])) {
444	m_type = TokError;
445	return m_type;
446	}
447	++m_pos;
448
449	while (m_pos < codeSize && isDecimalDigit(m_code [m_pos]))
450	++m_pos;
451	}
452
453	// ([eE][+-]? [0-9]+)?
454	if (m_pos < codeSize && (m_code [m_pos] == 'e' \|\| m_code [m_pos] == 'E')) { // [eE]
455	++m_pos;
456
457	// [-+]?
458	if (m_pos < codeSize && (m_code [m_pos] == '-' \|\| m_code [m_pos] == '+'))
459	++m_pos;
460
461	// [0-9]+
462	if (m_pos >= codeSize \|\| !isDecimalDigit(m_code [m_pos])) {
463	m_type = TokError;
464	return m_type;
465	}
466
467	++m_pos;
468	while (m_pos < codeSize && isDecimalDigit(m_code [m_pos]))
469	++m_pos;
470	}
471
472	m_numberToken = m_code.substr(start, m_pos-start).toDouble(false, false);
473	m_type = TokNumber;
474	#ifdef JSONLEXER_DEBUG_VERBOSE
475	fprintf(stderr, "Number: %f\n", m_numberToken);
476	#endif
477	return m_type;
478	}
479
480	UChar JSONLexer::parseEscapeChar(bool* error)
481	{
482	UChar cur = m_code [m_pos];
483	switch (cur.uc) {
484	case '"':
485	case '\\':
486	case '/':
487	++m_pos;
488	return cur;
489	case 'b':
490	++m_pos;
491	return UChar ('\b');
492	case 'f':
493	++m_pos;
494	return UChar ('\f');
495	case 'n':
496	++m_pos;
497	return UChar ('\n');
498	case 'r':
499	++m_pos;
500	return UChar ('\r');
501	case 't':
502	++m_pos;
503	return UChar ('\t');
504	case 'u':
505	{
506	if ((m_code.size() - (m_pos+`1`)) < `4`) {
507	error = true*;
508	return UChar (' ');
509	}
510	if (!isHexDigit(m_code [m_pos+`1`]) \|\| !isHexDigit(m_code [m_pos+`2`]) \|\|
511	!isHexDigit(m_code [m_pos+`3`]) \|\| !isHexDigit(m_code [m_pos+`4`])) {
512	error = true*;
513	return UChar (' ');
514	}
515
516	UChar next = Lexer::convertUnicode(m_code [m_pos+`1`].uc, m_code [m_pos+`2`].uc, m_code [m_pos+`3`].uc, m_code [m_pos+`4`].uc);
517	if (next.uc <= InvalidJSONUnicode)
518	{
519	error = true*;
520	return UChar (' ');
521	}
522
523	error = false*;
524	m_pos += `5`;
525	return next;
526	}
527	default:
528	error = true*;
529	return UChar (' ');
530	}
531	}
532
533	//helper function, checks if "word" is in the "code" at "pos".
534	static inline bool isStringSequence(int pos, const UString& code, const UString& word)
535	{
536	const int wordSize = word.size();
537	if (pos + wordSize > code.size())
538	return false;
539
540	//Skip first, we already checked it
541	for (int i = `1`; i < wordSize; ++i) {
542	if (code [pos+i].uc != word [i].uc)
543	return false;
544	}
545	return true;
546	}
547
548	TokenType JSONLexer::next()
549	{
550	while(true) {
551	if (m_pos >= m_code.size()) {
552	m_type = TokEnd;
553	return m_type;
554	}
555
556	if (!isJSONWhiteSpace(m_code [m_pos])) {
557	break;
558	}
559	++m_pos;
560	}
561
562	m_type = TokError;
563
564	#ifdef JSONLEXER_DEBUG_VERBOSE
565	fprintf(stderr, "JSONLexer::next current: %c \t\t pos: %d/%d\n", char(m_code[m_pos].uc), m_pos, m_code.size());
566	#endif
567
568	switch (m_code [m_pos].uc) {
569	case '[':
570	m_type = TokLBracket;
571	++m_pos;
572	return m_type;
573	case ']':
574	m_type = TokRBracket;
575	++m_pos;
576	return m_type;
577	case '(':
578	m_type = TokLParen;
579	++m_pos;
580	return m_type;
581	case ')':
582	m_type = TokRParen;
583	++m_pos;
584	return m_type;
585	case '{':
586	m_type = TokLBrace;
587	++m_pos;
588	return m_type;
589	case '}':
590	m_type = TokRBrace;
591	++m_pos;
592	return m_type;
593	case ',':
594	m_type = TokComma;
595	++m_pos;
596	return m_type;
597	case ':':
598	m_type = TokColon;
599	++m_pos;
600	return m_type;
601	case '"':
602	return lexString();
603
604	case 't':
605	if (isStringSequence(m_pos, m_code, "true")) {
606	m_type = TokTrue;
607	m_pos += `4`;
608	return m_type;
609	}
610	break;
611	case 'f':
612	if (isStringSequence(m_pos, m_code, "false")) {
613	m_type = TokFalse;
614	m_pos += `5`;
615	return m_type;
616	}
617	break;
618	case 'n':
619	if (isStringSequence(m_pos, m_code, "null")) {
620	m_type = TokNull;
621	m_pos += `4`;
622	return m_type;
623	}
624	break;
625	case '-':
626	case '0':
627	case '1':
628	case '2':
629	case '3':
630	case '4':
631	case '5':
632	case '6':
633	case '7':
634	case '8':
635	case '9':
636	return lexNumber();
637	}
638	return m_type;
639	}
640
641	bool JSONParser::nextParseIsEOF()
642	{
643	return m_lexer.next() == TokEnd;
644	}
645
646
647	} // namespace KJS
648