1 | /* |
2 | * This file is part of the KDE libraries |
3 | * Copyright (C) 2012 Bernd Buschinski (b.buschinski@googlemail.com) |
4 | * |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Library General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2 of the License, or (at your option) any later version. |
9 | * |
10 | * This library is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Library General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU Library General Public License |
16 | * along with this library; see the file COPYING.LIB. If not, write to |
17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
18 | * Boston, MA 02110-1301, USA. |
19 | * |
20 | */ |
21 | |
22 | #include "jsonlexer.h" |
23 | |
24 | #include <stack> |
25 | |
26 | #include "lexer.h" |
27 | #include "object.h" |
28 | |
29 | #include "wtf/Assertions.h" |
30 | |
31 | // #define JSONLEXER_DEBUG_VERBOSE |
32 | |
33 | namespace KJS { |
34 | |
35 | using namespace JSONParserState; |
36 | |
37 | static const unsigned short InvalidJSONUnicode = 0x001F; |
38 | |
39 | static inline bool isDecimalDigit(const UChar &c) |
40 | { |
41 | return (c.uc >= '0' && c.uc <= '9'); |
42 | } |
43 | |
44 | static inline bool isHexDigit(const UChar& c) |
45 | { |
46 | return (isDecimalDigit(c) || |
47 | (c.uc >= 'a' && c.uc <= 'f') || |
48 | (c.uc >= 'A' && c.uc <= 'F')); |
49 | } |
50 | |
51 | static inline bool isJSONWhiteSpace(const UChar& c) |
52 | { |
53 | //ECMA Edition 5.1r6 - 15.12.1.1 - Syntax |
54 | switch (c.uc) { |
55 | case 0x0020: //SP |
56 | case 0x0009: //TAB |
57 | case 0x000A: //LF |
58 | case 0x000D: //CR |
59 | return true; |
60 | default: |
61 | return false; |
62 | } |
63 | } |
64 | |
65 | #ifdef JSONLEXER_DEBUG_VERBOSE |
66 | static inline UString tokenToString(TokenType type) |
67 | { |
68 | switch (type) { |
69 | case TokLBracket: return UString("TokLBracket" ); |
70 | case TokRBracket: return UString("TokRBracket" ); |
71 | case TokLBrace: return UString("TokLBrace" ); |
72 | case TokRBrace: return UString("TokRBrace" ); |
73 | case TokString: return UString("TokString" ); |
74 | case TokIdentifier: return UString("TokIdentifier" ); |
75 | case TokNumber: return UString("TokNumber" ); |
76 | case TokColon: return UString("TokColon" ); |
77 | case TokLParen: return UString("TokLParen" ); |
78 | case TokRParen: return UString("TokRParen" ); |
79 | case TokComma: return UString("TokComma" ); |
80 | case TokTrue: return UString("TokTrue" ); |
81 | case TokFalse: return UString("TokFalse" ); |
82 | case TokNull: return UString("TokNull" ); |
83 | case TokEnd: return UString("TokEnd" ); |
84 | case TokError: return UString("TokError" ); |
85 | } |
86 | ASSERT_NOT_REACHED(); |
87 | return UString("Default" ); |
88 | } |
89 | |
90 | static inline UString parserStateToString(ParserState state) |
91 | { |
92 | switch (state) { |
93 | case JSONValue: return UString("JSONValue" ); |
94 | case JSONObject: return UString("JSONObject" ); |
95 | case JSONArray: return UString("JSONArray" ); |
96 | } |
97 | ASSERT_NOT_REACHED(); |
98 | return UString("Default" ); |
99 | } |
100 | #endif |
101 | |
102 | |
103 | // ------------------------------ JSONParser -------------------------------- |
104 | |
105 | JSONParser::JSONParser(const UString& code) |
106 | : m_lexer(code) |
107 | { |
108 | #ifdef JSONLEXER_DEBUG_VERBOSE |
109 | fprintf(stderr, "=============== new JSONParser ===============\n%s\n===============\n" , code.ascii()); |
110 | #endif |
111 | } |
112 | |
113 | JSValue* JSONParser::tryParse(ExecState* exec) |
114 | { |
115 | JSValue* ret = parse(exec); |
116 | // If the syntax is correct, we never see the EOF, the last used token may be '}'. |
117 | // But Syntax like "{} xyz" is also invalid, so we have to check if the next(last) token is EOF |
118 | if (ret && nextParseIsEOF()) |
119 | return ret; |
120 | return 0; |
121 | } |
122 | |
123 | // helper function for adding a value to the object. |
124 | // the arrayStack saves all added values and gives the correct array position. |
125 | // This function will return false for NULL value or on exception. |
126 | static inline bool addArrayItem(ExecState* exec, std::stack<JSValue*>* arrayStack, JSValue* value, JSObject* object) |
127 | { |
128 | if (exec->hadException()) |
129 | return false; |
130 | |
131 | if (!value) |
132 | return false; |
133 | |
134 | arrayStack->push(value); |
135 | object->put(exec, arrayStack->size()-1, value); |
136 | return true; |
137 | } |
138 | |
139 | JSValue* JSONParser::parse(ExecState* exec, ParserState state) |
140 | { |
141 | if (exec->hadException()) |
142 | return 0; |
143 | |
144 | ParserState tState = state; |
145 | TokenType type = m_lexer.next(); |
146 | |
147 | JSObject* object = 0; |
148 | std::stack<JSValue*> arrayObjectStack; |
149 | UString propertyName; |
150 | |
151 | // For parsing the Object, did we found a propertyName? |
152 | // NOTE: empty propertynames are allowed. |
153 | bool havePropertyName = false; |
154 | // For parsing the Object/Array, checks if we really added/found a propertyName |
155 | // before we find the comma ',' |
156 | bool propAdded = false; |
157 | // For parsing the Array, remember if last found token is Comma |
158 | bool lastFoundIsTokComma = false; |
159 | |
160 | while (type != TokEnd && type != TokError) { |
161 | #ifdef JSONLEXER_DEBUG_VERBOSE |
162 | fprintf(stderr, "TokenType: %s \t State: %s\n" , tokenToString(type).ascii(), parserStateToString(tState).ascii()); |
163 | #endif |
164 | |
165 | switch (tState) { |
166 | case JSONValue: |
167 | switch (type) { |
168 | case TokLBracket: |
169 | object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty())); |
170 | tState = JSONArray; |
171 | break; |
172 | case TokLBrace: |
173 | object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty())); |
174 | tState = JSONObject; |
175 | break; |
176 | case TokString: |
177 | return jsString(m_lexer.currentString()); |
178 | case TokNull: |
179 | return jsNull(); |
180 | case TokTrue: |
181 | return jsBoolean(true); |
182 | case TokFalse: |
183 | return jsBoolean(false); |
184 | case TokNumber: |
185 | return jsNumber(m_lexer.currentNumber()); |
186 | default: |
187 | // This can only happen on invalid syntax and with 0 return |
188 | // we tell the caller we got a syntax error. |
189 | |
190 | // ASSERT_NOT_REACHED(); |
191 | return 0; |
192 | } |
193 | break; |
194 | case JSONObject: { |
195 | // if we got called from JSONArray-TokLBrace we did not create an object. |
196 | |
197 | // In more detail for the following JSON String "[{}]" |
198 | // If we are in parse with type=JSONArray and state=TokLBrace, |
199 | // means we just found the "{" in the Array, and call parse(exec, JSONObject), |
200 | // now in this call type=JSONObject, state=TokRBrace ("}") and, our new, object=0 (!) |
201 | // We will finish the object and return it, but as object is null, we return 0. |
202 | // which would be wrong, as empty objects are allowed. |
203 | // In this case we just report invalid data. |
204 | // But for JSON String like "[{"a":1}]", we end up using object(0)->putDirect |
205 | // and crash. |
206 | |
207 | // In short, remove this line and we will crash. |
208 | object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty())); |
209 | switch (type) { |
210 | case TokString: // PropertyName |
211 | if (havePropertyName) |
212 | return 0; |
213 | propertyName = m_lexer.currentString(); |
214 | havePropertyName = true; |
215 | break; |
216 | case TokColon: { |
217 | if (!havePropertyName) |
218 | return 0; |
219 | JSValue* val = parse(exec, JSONValue); |
220 | if (!val) |
221 | return 0; |
222 | |
223 | // use putDirect to by-pass __proto__ |
224 | object->putDirect(Identifier(propertyName), val); |
225 | propertyName = "" ; |
226 | havePropertyName = false; |
227 | propAdded = true; |
228 | break; |
229 | } |
230 | case TokRBrace: //Finish Object |
231 | if (havePropertyName) |
232 | return 0; |
233 | return object; |
234 | case TokComma: // Next Property |
235 | if (!propAdded) |
236 | return 0; |
237 | propAdded = false; |
238 | break; |
239 | default: |
240 | // This can only happen on invalid syntax and with 0 return |
241 | // we tell the caller we got a syntax error. |
242 | |
243 | // ASSERT_NOT_REACHED(); |
244 | return 0; |
245 | } |
246 | break; |
247 | } |
248 | case JSONArray: { |
249 | // if we got called from JSONArray-TokLBracket we did not create an object |
250 | object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty())); |
251 | |
252 | // Check for invalid Array syntax, like ["1" "2"] |
253 | switch (type) { |
254 | case TokNumber: |
255 | case TokString: |
256 | case TokNull: |
257 | case TokTrue: |
258 | case TokFalse: |
259 | case TokLBrace: |
260 | case TokLBracket: |
261 | if (propAdded) |
262 | return 0; |
263 | propAdded = true; |
264 | lastFoundIsTokComma = false; |
265 | break; |
266 | default: |
267 | break; |
268 | } |
269 | |
270 | switch (type) { |
271 | case TokRBracket: // Finish array |
272 | // Check for invalid syntax like "[1,]" |
273 | if (lastFoundIsTokComma) |
274 | return 0; |
275 | return object; |
276 | case TokNumber: |
277 | if (!addArrayItem(exec, &arrayObjectStack, jsNumber(m_lexer.currentNumber()), object)) |
278 | return 0; |
279 | break; |
280 | case TokString: |
281 | if (!addArrayItem(exec, &arrayObjectStack, jsString(m_lexer.currentString()), object)) |
282 | return 0; |
283 | break; |
284 | case TokNull: |
285 | if (!addArrayItem(exec, &arrayObjectStack, jsNull(), object)) |
286 | return 0; |
287 | break; |
288 | case TokTrue: |
289 | if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(true), object)) |
290 | return 0; |
291 | break; |
292 | case TokFalse: |
293 | if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(false), object)) |
294 | return 0; |
295 | break; |
296 | case TokLBrace: |
297 | if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONObject), object)) |
298 | return 0; |
299 | break; |
300 | case TokLBracket: |
301 | if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONArray), object)) |
302 | return 0; |
303 | break; |
304 | case TokComma: // Skip Comma and parse next Array Element |
305 | // if we found a comma without a leading property, this is invalid syntax |
306 | if (!propAdded) |
307 | return 0; |
308 | propAdded = false; |
309 | lastFoundIsTokComma = true; |
310 | break; |
311 | default: |
312 | // This can only happen on invalid syntax and with 0 return |
313 | // we tell the caller we got a syntax error. |
314 | |
315 | // ASSERT_NOT_REACHED(); |
316 | return 0; |
317 | } |
318 | break; |
319 | } |
320 | default: |
321 | ASSERT_NOT_REACHED(); |
322 | return 0; |
323 | } |
324 | type = m_lexer.next(); |
325 | } |
326 | |
327 | if (type == TokError) { |
328 | #ifdef JSONLEXER_DEBUG_VERBOSE |
329 | fprintf(stderr, "WARNING: JSONParse ending with error!\n" ); |
330 | #endif |
331 | return 0; |
332 | } |
333 | if (type == TokEnd) { |
334 | #ifdef JSONLEXER_DEBUG_VERBOSE |
335 | fprintf(stderr, "WARNING: JSONParse ending with unexpected END!\n" ); |
336 | #endif |
337 | return 0; |
338 | } |
339 | ASSERT_NOT_REACHED(); |
340 | return 0; |
341 | } |
342 | |
343 | |
344 | // ------------------------------ JSONLexer -------------------------------- |
345 | |
346 | JSONLexer::JSONLexer(const UString& code) |
347 | : m_code(code), |
348 | m_pos(0) |
349 | { |
350 | } |
351 | |
352 | TokenType JSONLexer::current() |
353 | { |
354 | return m_type; |
355 | } |
356 | |
357 | double JSONLexer::currentNumber() const |
358 | { |
359 | ASSERT(m_type == TokNumber); |
360 | return m_numberToken; |
361 | } |
362 | |
363 | UString JSONLexer::currentString() const |
364 | { |
365 | ASSERT(m_type == TokString); |
366 | return m_stringToken; |
367 | } |
368 | |
369 | TokenType JSONLexer::lexString() |
370 | { |
371 | UString string; |
372 | const int codeSize = m_code.size(); |
373 | //skip first detected '"' |
374 | ++m_pos; |
375 | |
376 | if (m_pos >= codeSize) { |
377 | m_type = TokError; |
378 | return m_type; |
379 | } |
380 | |
381 | //while not at the end of the string '"' |
382 | while (!(m_code[m_pos] == '"')) { |
383 | UChar cur = m_code[m_pos]; |
384 | if (cur == UChar('\\')) { |
385 | ++m_pos; |
386 | bool error = false; |
387 | string.append(parseEscapeChar(&error)); |
388 | if (error) { |
389 | m_type = TokError; |
390 | return m_type; |
391 | } |
392 | } else { |
393 | if (cur.uc <= InvalidJSONUnicode) { |
394 | m_type = TokError; |
395 | return m_type; |
396 | } |
397 | string.append(cur); |
398 | ++m_pos; |
399 | } |
400 | |
401 | if (m_pos >= codeSize) { |
402 | m_type = TokError; |
403 | return m_type; |
404 | } |
405 | } |
406 | |
407 | m_type = TokString; |
408 | m_stringToken = string; |
409 | ++m_pos; |
410 | #ifdef JSONLEXER_DEBUG_VERBOSE |
411 | fprintf(stderr, "JSONLexer::lexString: Pos:%d stringlength:%d string:%s\n" , m_pos, string.size(), string.ascii()); |
412 | #endif |
413 | return m_type; |
414 | } |
415 | |
416 | TokenType JSONLexer::lexNumber() |
417 | { |
418 | const int start = m_pos; |
419 | const int codeSize = m_code.size(); |
420 | |
421 | // -?(0 | [1-9][0-9]*) ('.' [0-9]+)? ([eE][+-]? [0-9]+)? |
422 | |
423 | // -? |
424 | if (m_pos < codeSize && m_code[m_pos] == '-') |
425 | ++m_pos; |
426 | |
427 | // (0 | [1-9][0-9]*) |
428 | if (m_pos < codeSize && m_code[m_pos] == '0') { |
429 | ++m_pos; |
430 | } else if (m_pos < codeSize) { |
431 | while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) { |
432 | ++m_pos; |
433 | } |
434 | } else { |
435 | m_type = TokError; |
436 | return m_type; |
437 | } |
438 | |
439 | // ('.' [0-9]+)? |
440 | if (m_pos < codeSize && m_code[m_pos] == '.') { |
441 | ++m_pos; |
442 | // [0-9]+ |
443 | if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) { |
444 | m_type = TokError; |
445 | return m_type; |
446 | } |
447 | ++m_pos; |
448 | |
449 | while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) |
450 | ++m_pos; |
451 | } |
452 | |
453 | // ([eE][+-]? [0-9]+)? |
454 | if (m_pos < codeSize && (m_code[m_pos] == 'e' || m_code[m_pos] == 'E')) { // [eE] |
455 | ++m_pos; |
456 | |
457 | // [-+]? |
458 | if (m_pos < codeSize && (m_code[m_pos] == '-' || m_code[m_pos] == '+')) |
459 | ++m_pos; |
460 | |
461 | // [0-9]+ |
462 | if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) { |
463 | m_type = TokError; |
464 | return m_type; |
465 | } |
466 | |
467 | ++m_pos; |
468 | while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) |
469 | ++m_pos; |
470 | } |
471 | |
472 | m_numberToken = m_code.substr(start, m_pos-start).toDouble(false, false); |
473 | m_type = TokNumber; |
474 | #ifdef JSONLEXER_DEBUG_VERBOSE |
475 | fprintf(stderr, "Number: %f\n" , m_numberToken); |
476 | #endif |
477 | return m_type; |
478 | } |
479 | |
480 | UChar JSONLexer::parseEscapeChar(bool* error) |
481 | { |
482 | UChar cur = m_code[m_pos]; |
483 | switch (cur.uc) { |
484 | case '"': |
485 | case '\\': |
486 | case '/': |
487 | ++m_pos; |
488 | return cur; |
489 | case 'b': |
490 | ++m_pos; |
491 | return UChar('\b'); |
492 | case 'f': |
493 | ++m_pos; |
494 | return UChar('\f'); |
495 | case 'n': |
496 | ++m_pos; |
497 | return UChar('\n'); |
498 | case 'r': |
499 | ++m_pos; |
500 | return UChar('\r'); |
501 | case 't': |
502 | ++m_pos; |
503 | return UChar('\t'); |
504 | case 'u': |
505 | { |
506 | if ((m_code.size() - (m_pos+1)) < 4) { |
507 | *error = true; |
508 | return UChar(' '); |
509 | } |
510 | if (!isHexDigit(m_code[m_pos+1]) || !isHexDigit(m_code[m_pos+2]) || |
511 | !isHexDigit(m_code[m_pos+3]) || !isHexDigit(m_code[m_pos+4])) { |
512 | *error = true; |
513 | return UChar(' '); |
514 | } |
515 | |
516 | UChar next = Lexer::convertUnicode(m_code[m_pos+1].uc, m_code[m_pos+2].uc, m_code[m_pos+3].uc, m_code[m_pos+4].uc); |
517 | if (next.uc <= InvalidJSONUnicode) |
518 | { |
519 | *error = true; |
520 | return UChar(' '); |
521 | } |
522 | |
523 | *error = false; |
524 | m_pos += 5; |
525 | return next; |
526 | } |
527 | default: |
528 | *error = true; |
529 | return UChar(' '); |
530 | } |
531 | } |
532 | |
533 | //helper function, checks if "word" is in the "code" at "pos". |
534 | static inline bool isStringSequence(int pos, const UString& code, const UString& word) |
535 | { |
536 | const int wordSize = word.size(); |
537 | if (pos + wordSize > code.size()) |
538 | return false; |
539 | |
540 | //Skip first, we already checked it |
541 | for (int i = 1; i < wordSize; ++i) { |
542 | if (code[pos+i].uc != word[i].uc) |
543 | return false; |
544 | } |
545 | return true; |
546 | } |
547 | |
548 | TokenType JSONLexer::next() |
549 | { |
550 | while(true) { |
551 | if (m_pos >= m_code.size()) { |
552 | m_type = TokEnd; |
553 | return m_type; |
554 | } |
555 | |
556 | if (!isJSONWhiteSpace(m_code[m_pos])) { |
557 | break; |
558 | } |
559 | ++m_pos; |
560 | } |
561 | |
562 | m_type = TokError; |
563 | |
564 | #ifdef JSONLEXER_DEBUG_VERBOSE |
565 | fprintf(stderr, "JSONLexer::next current: %c \t\t pos: %d/%d\n" , char(m_code[m_pos].uc), m_pos, m_code.size()); |
566 | #endif |
567 | |
568 | switch (m_code[m_pos].uc) { |
569 | case '[': |
570 | m_type = TokLBracket; |
571 | ++m_pos; |
572 | return m_type; |
573 | case ']': |
574 | m_type = TokRBracket; |
575 | ++m_pos; |
576 | return m_type; |
577 | case '(': |
578 | m_type = TokLParen; |
579 | ++m_pos; |
580 | return m_type; |
581 | case ')': |
582 | m_type = TokRParen; |
583 | ++m_pos; |
584 | return m_type; |
585 | case '{': |
586 | m_type = TokLBrace; |
587 | ++m_pos; |
588 | return m_type; |
589 | case '}': |
590 | m_type = TokRBrace; |
591 | ++m_pos; |
592 | return m_type; |
593 | case ',': |
594 | m_type = TokComma; |
595 | ++m_pos; |
596 | return m_type; |
597 | case ':': |
598 | m_type = TokColon; |
599 | ++m_pos; |
600 | return m_type; |
601 | case '"': |
602 | return lexString(); |
603 | |
604 | case 't': |
605 | if (isStringSequence(m_pos, m_code, "true" )) { |
606 | m_type = TokTrue; |
607 | m_pos += 4; |
608 | return m_type; |
609 | } |
610 | break; |
611 | case 'f': |
612 | if (isStringSequence(m_pos, m_code, "false" )) { |
613 | m_type = TokFalse; |
614 | m_pos += 5; |
615 | return m_type; |
616 | } |
617 | break; |
618 | case 'n': |
619 | if (isStringSequence(m_pos, m_code, "null" )) { |
620 | m_type = TokNull; |
621 | m_pos += 4; |
622 | return m_type; |
623 | } |
624 | break; |
625 | case '-': |
626 | case '0': |
627 | case '1': |
628 | case '2': |
629 | case '3': |
630 | case '4': |
631 | case '5': |
632 | case '6': |
633 | case '7': |
634 | case '8': |
635 | case '9': |
636 | return lexNumber(); |
637 | } |
638 | return m_type; |
639 | } |
640 | |
641 | bool JSONParser::nextParseIsEOF() |
642 | { |
643 | return m_lexer.next() == TokEnd; |
644 | } |
645 | |
646 | |
647 | } // namespace KJS |
648 | |