1/*
2 * This file is part of the KDE libraries
3 * Copyright (C) 2012 Bernd Buschinski (b.buschinski@googlemail.com)
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22#include "jsonlexer.h"
23
24#include <stack>
25
26#include "lexer.h"
27#include "object.h"
28
29#include "wtf/Assertions.h"
30
31// #define JSONLEXER_DEBUG_VERBOSE
32
33namespace KJS {
34
35using namespace JSONParserState;
36
37static const unsigned short InvalidJSONUnicode = 0x001F;
38
39static inline bool isDecimalDigit(const UChar &c)
40{
41 return (c.uc >= '0' && c.uc <= '9');
42}
43
44static inline bool isHexDigit(const UChar& c)
45{
46 return (isDecimalDigit(c) ||
47 (c.uc >= 'a' && c.uc <= 'f') ||
48 (c.uc >= 'A' && c.uc <= 'F'));
49}
50
51static inline bool isJSONWhiteSpace(const UChar& c)
52{
53 //ECMA Edition 5.1r6 - 15.12.1.1 - Syntax
54 switch (c.uc) {
55 case 0x0020: //SP
56 case 0x0009: //TAB
57 case 0x000A: //LF
58 case 0x000D: //CR
59 return true;
60 default:
61 return false;
62 }
63}
64
65#ifdef JSONLEXER_DEBUG_VERBOSE
66static inline UString tokenToString(TokenType type)
67{
68 switch (type) {
69 case TokLBracket: return UString("TokLBracket");
70 case TokRBracket: return UString("TokRBracket");
71 case TokLBrace: return UString("TokLBrace");
72 case TokRBrace: return UString("TokRBrace");
73 case TokString: return UString("TokString");
74 case TokIdentifier: return UString("TokIdentifier");
75 case TokNumber: return UString("TokNumber");
76 case TokColon: return UString("TokColon");
77 case TokLParen: return UString("TokLParen");
78 case TokRParen: return UString("TokRParen");
79 case TokComma: return UString("TokComma");
80 case TokTrue: return UString("TokTrue");
81 case TokFalse: return UString("TokFalse");
82 case TokNull: return UString("TokNull");
83 case TokEnd: return UString("TokEnd");
84 case TokError: return UString("TokError");
85 }
86 ASSERT_NOT_REACHED();
87 return UString("Default");
88}
89
90static inline UString parserStateToString(ParserState state)
91{
92 switch (state) {
93 case JSONValue: return UString("JSONValue");
94 case JSONObject: return UString("JSONObject");
95 case JSONArray: return UString("JSONArray");
96 }
97 ASSERT_NOT_REACHED();
98 return UString("Default");
99}
100#endif
101
102
103// ------------------------------ JSONParser --------------------------------
104
105JSONParser::JSONParser(const UString& code)
106 : m_lexer(code)
107{
108#ifdef JSONLEXER_DEBUG_VERBOSE
109 fprintf(stderr, "=============== new JSONParser ===============\n%s\n===============\n", code.ascii());
110#endif
111}
112
113JSValue* JSONParser::tryParse(ExecState* exec)
114{
115 JSValue* ret = parse(exec);
116 // If the syntax is correct, we never see the EOF, the last used token may be '}'.
117 // But Syntax like "{} xyz" is also invalid, so we have to check if the next(last) token is EOF
118 if (ret && nextParseIsEOF())
119 return ret;
120 return 0;
121}
122
123// helper function for adding a value to the object.
124// the arrayStack saves all added values and gives the correct array position.
125// This function will return false for NULL value or on exception.
126static inline bool addArrayItem(ExecState* exec, std::stack<JSValue*>* arrayStack, JSValue* value, JSObject* object)
127{
128 if (exec->hadException())
129 return false;
130
131 if (!value)
132 return false;
133
134 arrayStack->push(value);
135 object->put(exec, arrayStack->size()-1, value);
136 return true;
137}
138
139JSValue* JSONParser::parse(ExecState* exec, ParserState state)
140{
141 if (exec->hadException())
142 return 0;
143
144 ParserState tState = state;
145 TokenType type = m_lexer.next();
146
147 JSObject* object = 0;
148 std::stack<JSValue*> arrayObjectStack;
149 UString propertyName;
150
151 // For parsing the Object, did we found a propertyName?
152 // NOTE: empty propertynames are allowed.
153 bool havePropertyName = false;
154 // For parsing the Object/Array, checks if we really added/found a propertyName
155 // before we find the comma ','
156 bool propAdded = false;
157 // For parsing the Array, remember if last found token is Comma
158 bool lastFoundIsTokComma = false;
159
160 while (type != TokEnd && type != TokError) {
161#ifdef JSONLEXER_DEBUG_VERBOSE
162 fprintf(stderr, "TokenType: %s \t State: %s\n", tokenToString(type).ascii(), parserStateToString(tState).ascii());
163#endif
164
165 switch (tState) {
166 case JSONValue:
167 switch (type) {
168 case TokLBracket:
169 object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
170 tState = JSONArray;
171 break;
172 case TokLBrace:
173 object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
174 tState = JSONObject;
175 break;
176 case TokString:
177 return jsString(m_lexer.currentString());
178 case TokNull:
179 return jsNull();
180 case TokTrue:
181 return jsBoolean(true);
182 case TokFalse:
183 return jsBoolean(false);
184 case TokNumber:
185 return jsNumber(m_lexer.currentNumber());
186 default:
187 // This can only happen on invalid syntax and with 0 return
188 // we tell the caller we got a syntax error.
189
190 // ASSERT_NOT_REACHED();
191 return 0;
192 }
193 break;
194 case JSONObject: {
195 // if we got called from JSONArray-TokLBrace we did not create an object.
196
197 // In more detail for the following JSON String "[{}]"
198 // If we are in parse with type=JSONArray and state=TokLBrace,
199 // means we just found the "{" in the Array, and call parse(exec, JSONObject),
200 // now in this call type=JSONObject, state=TokRBrace ("}") and, our new, object=0 (!)
201 // We will finish the object and return it, but as object is null, we return 0.
202 // which would be wrong, as empty objects are allowed.
203 // In this case we just report invalid data.
204 // But for JSON String like "[{"a":1}]", we end up using object(0)->putDirect
205 // and crash.
206
207 // In short, remove this line and we will crash.
208 object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
209 switch (type) {
210 case TokString: // PropertyName
211 if (havePropertyName)
212 return 0;
213 propertyName = m_lexer.currentString();
214 havePropertyName = true;
215 break;
216 case TokColon: {
217 if (!havePropertyName)
218 return 0;
219 JSValue* val = parse(exec, JSONValue);
220 if (!val)
221 return 0;
222
223 // use putDirect to by-pass __proto__
224 object->putDirect(Identifier(propertyName), val);
225 propertyName = "";
226 havePropertyName = false;
227 propAdded = true;
228 break;
229 }
230 case TokRBrace: //Finish Object
231 if (havePropertyName)
232 return 0;
233 return object;
234 case TokComma: // Next Property
235 if (!propAdded)
236 return 0;
237 propAdded = false;
238 break;
239 default:
240 // This can only happen on invalid syntax and with 0 return
241 // we tell the caller we got a syntax error.
242
243 // ASSERT_NOT_REACHED();
244 return 0;
245 }
246 break;
247 }
248 case JSONArray: {
249 // if we got called from JSONArray-TokLBracket we did not create an object
250 object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
251
252 // Check for invalid Array syntax, like ["1" "2"]
253 switch (type) {
254 case TokNumber:
255 case TokString:
256 case TokNull:
257 case TokTrue:
258 case TokFalse:
259 case TokLBrace:
260 case TokLBracket:
261 if (propAdded)
262 return 0;
263 propAdded = true;
264 lastFoundIsTokComma = false;
265 break;
266 default:
267 break;
268 }
269
270 switch (type) {
271 case TokRBracket: // Finish array
272 // Check for invalid syntax like "[1,]"
273 if (lastFoundIsTokComma)
274 return 0;
275 return object;
276 case TokNumber:
277 if (!addArrayItem(exec, &arrayObjectStack, jsNumber(m_lexer.currentNumber()), object))
278 return 0;
279 break;
280 case TokString:
281 if (!addArrayItem(exec, &arrayObjectStack, jsString(m_lexer.currentString()), object))
282 return 0;
283 break;
284 case TokNull:
285 if (!addArrayItem(exec, &arrayObjectStack, jsNull(), object))
286 return 0;
287 break;
288 case TokTrue:
289 if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(true), object))
290 return 0;
291 break;
292 case TokFalse:
293 if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(false), object))
294 return 0;
295 break;
296 case TokLBrace:
297 if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONObject), object))
298 return 0;
299 break;
300 case TokLBracket:
301 if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONArray), object))
302 return 0;
303 break;
304 case TokComma: // Skip Comma and parse next Array Element
305 // if we found a comma without a leading property, this is invalid syntax
306 if (!propAdded)
307 return 0;
308 propAdded = false;
309 lastFoundIsTokComma = true;
310 break;
311 default:
312 // This can only happen on invalid syntax and with 0 return
313 // we tell the caller we got a syntax error.
314
315 // ASSERT_NOT_REACHED();
316 return 0;
317 }
318 break;
319 }
320 default:
321 ASSERT_NOT_REACHED();
322 return 0;
323 }
324 type = m_lexer.next();
325 }
326
327 if (type == TokError) {
328#ifdef JSONLEXER_DEBUG_VERBOSE
329 fprintf(stderr, "WARNING: JSONParse ending with error!\n");
330#endif
331 return 0;
332 }
333 if (type == TokEnd) {
334#ifdef JSONLEXER_DEBUG_VERBOSE
335 fprintf(stderr, "WARNING: JSONParse ending with unexpected END!\n");
336#endif
337 return 0;
338 }
339 ASSERT_NOT_REACHED();
340 return 0;
341}
342
343
344// ------------------------------ JSONLexer --------------------------------
345
346JSONLexer::JSONLexer(const UString& code)
347 : m_code(code),
348 m_pos(0)
349{
350}
351
352TokenType JSONLexer::current()
353{
354 return m_type;
355}
356
357double JSONLexer::currentNumber() const
358{
359 ASSERT(m_type == TokNumber);
360 return m_numberToken;
361}
362
363UString JSONLexer::currentString() const
364{
365 ASSERT(m_type == TokString);
366 return m_stringToken;
367}
368
369TokenType JSONLexer::lexString()
370{
371 UString string;
372 const int codeSize = m_code.size();
373 //skip first detected '"'
374 ++m_pos;
375
376 if (m_pos >= codeSize) {
377 m_type = TokError;
378 return m_type;
379 }
380
381 //while not at the end of the string '"'
382 while (!(m_code[m_pos] == '"')) {
383 UChar cur = m_code[m_pos];
384 if (cur == UChar('\\')) {
385 ++m_pos;
386 bool error = false;
387 string.append(parseEscapeChar(&error));
388 if (error) {
389 m_type = TokError;
390 return m_type;
391 }
392 } else {
393 if (cur.uc <= InvalidJSONUnicode) {
394 m_type = TokError;
395 return m_type;
396 }
397 string.append(cur);
398 ++m_pos;
399 }
400
401 if (m_pos >= codeSize) {
402 m_type = TokError;
403 return m_type;
404 }
405 }
406
407 m_type = TokString;
408 m_stringToken = string;
409 ++m_pos;
410#ifdef JSONLEXER_DEBUG_VERBOSE
411 fprintf(stderr, "JSONLexer::lexString: Pos:%d stringlength:%d string:%s\n", m_pos, string.size(), string.ascii());
412#endif
413 return m_type;
414}
415
416TokenType JSONLexer::lexNumber()
417{
418 const int start = m_pos;
419 const int codeSize = m_code.size();
420
421 // -?(0 | [1-9][0-9]*) ('.' [0-9]+)? ([eE][+-]? [0-9]+)?
422
423 // -?
424 if (m_pos < codeSize && m_code[m_pos] == '-')
425 ++m_pos;
426
427 // (0 | [1-9][0-9]*)
428 if (m_pos < codeSize && m_code[m_pos] == '0') {
429 ++m_pos;
430 } else if (m_pos < codeSize) {
431 while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) {
432 ++m_pos;
433 }
434 } else {
435 m_type = TokError;
436 return m_type;
437 }
438
439 // ('.' [0-9]+)?
440 if (m_pos < codeSize && m_code[m_pos] == '.') {
441 ++m_pos;
442 // [0-9]+
443 if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) {
444 m_type = TokError;
445 return m_type;
446 }
447 ++m_pos;
448
449 while (m_pos < codeSize && isDecimalDigit(m_code[m_pos]))
450 ++m_pos;
451 }
452
453 // ([eE][+-]? [0-9]+)?
454 if (m_pos < codeSize && (m_code[m_pos] == 'e' || m_code[m_pos] == 'E')) { // [eE]
455 ++m_pos;
456
457 // [-+]?
458 if (m_pos < codeSize && (m_code[m_pos] == '-' || m_code[m_pos] == '+'))
459 ++m_pos;
460
461 // [0-9]+
462 if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) {
463 m_type = TokError;
464 return m_type;
465 }
466
467 ++m_pos;
468 while (m_pos < codeSize && isDecimalDigit(m_code[m_pos]))
469 ++m_pos;
470 }
471
472 m_numberToken = m_code.substr(start, m_pos-start).toDouble(false, false);
473 m_type = TokNumber;
474#ifdef JSONLEXER_DEBUG_VERBOSE
475 fprintf(stderr, "Number: %f\n", m_numberToken);
476#endif
477 return m_type;
478}
479
480UChar JSONLexer::parseEscapeChar(bool* error)
481{
482 UChar cur = m_code[m_pos];
483 switch (cur.uc) {
484 case '"':
485 case '\\':
486 case '/':
487 ++m_pos;
488 return cur;
489 case 'b':
490 ++m_pos;
491 return UChar('\b');
492 case 'f':
493 ++m_pos;
494 return UChar('\f');
495 case 'n':
496 ++m_pos;
497 return UChar('\n');
498 case 'r':
499 ++m_pos;
500 return UChar('\r');
501 case 't':
502 ++m_pos;
503 return UChar('\t');
504 case 'u':
505 {
506 if ((m_code.size() - (m_pos+1)) < 4) {
507 *error = true;
508 return UChar(' ');
509 }
510 if (!isHexDigit(m_code[m_pos+1]) || !isHexDigit(m_code[m_pos+2]) ||
511 !isHexDigit(m_code[m_pos+3]) || !isHexDigit(m_code[m_pos+4])) {
512 *error = true;
513 return UChar(' ');
514 }
515
516 UChar next = Lexer::convertUnicode(m_code[m_pos+1].uc, m_code[m_pos+2].uc, m_code[m_pos+3].uc, m_code[m_pos+4].uc);
517 if (next.uc <= InvalidJSONUnicode)
518 {
519 *error = true;
520 return UChar(' ');
521 }
522
523 *error = false;
524 m_pos += 5;
525 return next;
526 }
527 default:
528 *error = true;
529 return UChar(' ');
530 }
531}
532
533//helper function, checks if "word" is in the "code" at "pos".
534static inline bool isStringSequence(int pos, const UString& code, const UString& word)
535{
536 const int wordSize = word.size();
537 if (pos + wordSize > code.size())
538 return false;
539
540 //Skip first, we already checked it
541 for (int i = 1; i < wordSize; ++i) {
542 if (code[pos+i].uc != word[i].uc)
543 return false;
544 }
545 return true;
546}
547
548TokenType JSONLexer::next()
549{
550 while(true) {
551 if (m_pos >= m_code.size()) {
552 m_type = TokEnd;
553 return m_type;
554 }
555
556 if (!isJSONWhiteSpace(m_code[m_pos])) {
557 break;
558 }
559 ++m_pos;
560 }
561
562 m_type = TokError;
563
564#ifdef JSONLEXER_DEBUG_VERBOSE
565 fprintf(stderr, "JSONLexer::next current: %c \t\t pos: %d/%d\n", char(m_code[m_pos].uc), m_pos, m_code.size());
566#endif
567
568 switch (m_code[m_pos].uc) {
569 case '[':
570 m_type = TokLBracket;
571 ++m_pos;
572 return m_type;
573 case ']':
574 m_type = TokRBracket;
575 ++m_pos;
576 return m_type;
577 case '(':
578 m_type = TokLParen;
579 ++m_pos;
580 return m_type;
581 case ')':
582 m_type = TokRParen;
583 ++m_pos;
584 return m_type;
585 case '{':
586 m_type = TokLBrace;
587 ++m_pos;
588 return m_type;
589 case '}':
590 m_type = TokRBrace;
591 ++m_pos;
592 return m_type;
593 case ',':
594 m_type = TokComma;
595 ++m_pos;
596 return m_type;
597 case ':':
598 m_type = TokColon;
599 ++m_pos;
600 return m_type;
601 case '"':
602 return lexString();
603
604 case 't':
605 if (isStringSequence(m_pos, m_code, "true")) {
606 m_type = TokTrue;
607 m_pos += 4;
608 return m_type;
609 }
610 break;
611 case 'f':
612 if (isStringSequence(m_pos, m_code, "false")) {
613 m_type = TokFalse;
614 m_pos += 5;
615 return m_type;
616 }
617 break;
618 case 'n':
619 if (isStringSequence(m_pos, m_code, "null")) {
620 m_type = TokNull;
621 m_pos += 4;
622 return m_type;
623 }
624 break;
625 case '-':
626 case '0':
627 case '1':
628 case '2':
629 case '3':
630 case '4':
631 case '5':
632 case '6':
633 case '7':
634 case '8':
635 case '9':
636 return lexNumber();
637 }
638 return m_type;
639}
640
641bool JSONParser::nextParseIsEOF()
642{
643 return m_lexer.next() == TokEnd;
644}
645
646
647} // namespace KJS
648