1 | // -*- c-basic-offset: 2 -*- |
2 | /* |
3 | * This file is part of the KDE libraries |
4 | * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) |
5 | * Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved. |
6 | * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca) |
7 | * |
8 | * This library is free software; you can redistribute it and/or |
9 | * modify it under the terms of the GNU Library General Public |
10 | * License as published by the Free Software Foundation; either |
11 | * version 2 of the License, or (at your option) any later version. |
12 | * |
13 | * This library is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | * Library General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU Library General Public License |
19 | * along with this library; see the file COPYING.LIB. If not, write to |
20 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
21 | * Boston, MA 02110-1301, USA. |
22 | * |
23 | */ |
24 | |
25 | #include "ustring.h" |
26 | #include <config-kjs.h> |
27 | |
28 | #include <assert.h> |
29 | #include <stdlib.h> |
30 | #include <stdio.h> |
31 | #include "wtf/DisallowCType.h" |
32 | #include "wtf/ASCIICType.h" |
33 | #if HAVE(STRING_H) |
34 | #include <string.h> |
35 | #endif |
36 | #if HAVE(STRINGS_H) |
37 | #include <strings.h> |
38 | #endif |
39 | #include <limits.h> |
40 | |
41 | #include "operations.h" |
42 | #include "function.h" |
43 | #include "identifier.h" |
44 | #include <math.h> |
45 | #include "dtoa.h" |
46 | #include "collector.h" |
47 | #include "commonunicode.h" |
48 | |
49 | #include <wtf/Vector.h> |
50 | |
51 | using std::max; |
52 | |
53 | // GCC cstring uses these automatically, but not all implementations do. |
54 | using std::strlen; |
55 | using std::strcpy; |
56 | using std::strncpy; |
57 | using std::memset; |
58 | using std::memcpy; |
59 | |
60 | using namespace WTF; |
61 | |
62 | namespace KJS { |
63 | |
// Floating-point constants used by the numeric conversions in this file
// (for example, UString::toDouble returns NaN for unparsable input and
// compares against Inf and -Inf). They are only declared here; the
// definitions are not part of this file section.
extern const double NaN;
extern const double Inf;
66 | |
// Sentinel returned by size computations that cannot be satisfied: the
// largest value a size_t can hold.
static inline size_t overflowIndicator()
{
  return std::numeric_limits<size_t>::max();
}
68 | static inline size_t maxUChars() { return std::numeric_limits<size_t>::max() / sizeof(UChar); } |
69 | |
70 | static inline UChar* allocChars(size_t length) |
71 | { |
72 | assert(length); |
73 | if (length > maxUChars()) |
74 | return 0; |
75 | return static_cast<UChar*>(fastMalloc(sizeof(UChar) * length)); |
76 | } |
77 | |
78 | static inline UChar* reallocChars(UChar* buffer, size_t length) |
79 | { |
80 | ASSERT(length); |
81 | if (length > maxUChars()) |
82 | return 0; |
83 | return static_cast<UChar*>(fastRealloc(buffer, sizeof(UChar) * length)); |
84 | } |
85 | |
86 | CString::CString(const char *c) |
87 | { |
88 | length = strlen(c); |
89 | data = new char[length+1]; |
90 | memcpy(data, c, length + 1); |
91 | } |
92 | |
93 | CString::CString(const char *c, size_t len) |
94 | { |
95 | length = len; |
96 | data = new char[len+1]; |
97 | memcpy(data, c, len); |
98 | data[len] = 0; |
99 | } |
100 | |
101 | CString::CString(const CString &b) |
102 | { |
103 | length = b.length; |
104 | if (length > 0 && b.data) { |
105 | data = new char[length+1]; |
106 | memcpy(data, b.data, length + 1); |
107 | } |
108 | else |
109 | data = 0; |
110 | } |
111 | |
112 | CString::~CString() |
113 | { |
114 | delete [] data; |
115 | } |
116 | |
117 | CString &CString::operator=(const char *c) |
118 | { |
119 | if (data) |
120 | delete [] data; |
121 | length = strlen(c); |
122 | data = new char[length+1]; |
123 | memcpy(data, c, length + 1); |
124 | |
125 | return *this; |
126 | } |
127 | |
128 | CString &CString::operator=(const CString &str) |
129 | { |
130 | if (this == &str) |
131 | return *this; |
132 | |
133 | if (data) |
134 | delete [] data; |
135 | length = str.length; |
136 | if (str.data) { |
137 | data = new char[length + 1]; |
138 | memcpy(data, str.data, length + 1); |
139 | } |
140 | else |
141 | data = 0; |
142 | |
143 | return *this; |
144 | } |
145 | |
146 | bool operator==(const CString& c1, const CString& c2) |
147 | { |
148 | size_t len = c1.size(); |
149 | return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0); |
150 | } |
151 | |
// Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
static unsigned short almostUChar;
// Statically initialised Reps that UString points at for the null string and
// the empty string. Each initializer list contains a pointer to the Rep
// itself (presumably the baseString field, matching what Rep::create sets up;
// the field order is declared in the header, not here). The empty Rep is
// additionally given a non-null character pointer (almostUChar); the null Rep
// has none.
UString::Rep UString::Rep::null = { 0, 0, 1, 0, 0, &UString::Rep::null, 0, 0, 0, 0, 0, 0 };
UString::Rep UString::Rep::empty = { 0, 0, 1, 0, 0, &UString::Rep::empty, 0, reinterpret_cast<UChar*>(&almostUChar), 0, 0, 0, 0 };
// Minimum size of the shared buffer that UString::ascii() writes into.
const int normalStatBufferSize = 4096;
// Shared buffer returned by UString::ascii(), and its current size in bytes.
// Every ascii() call overwrites (and may reallocate) it.
static char *statBuffer = 0; // FIXME: This buffer is never deallocated.
static int statBufferSize = 0;
159 | |
160 | PassRefPtr<UString::Rep> UString::Rep::createCopying (const UChar* d, int length) |
161 | { |
162 | UChar* copyD = allocChars(length); |
163 | memcpy(copyD, d, length * sizeof(UChar)); |
164 | |
165 | return create(copyD, length); |
166 | } |
167 | |
168 | PassRefPtr<UString::Rep> UString::Rep::create(UChar *d, int l) |
169 | { |
170 | Rep* r = new Rep; |
171 | r->offset = 0; |
172 | r->len = l; |
173 | r->rc = 1; |
174 | r->_hash = 0; |
175 | r->isIdentifier = 0; |
176 | r->baseString = r; |
177 | r->reportedCost = 0; |
178 | r->buf = d; |
179 | r->usedCapacity = l; |
180 | r->capacity = l; |
181 | r->usedPreCapacity = 0; |
182 | r->preCapacity = 0; |
183 | |
184 | // steal the single reference this Rep was created with |
185 | return adoptRef(r); |
186 | } |
187 | |
188 | PassRefPtr<UString::Rep> UString::Rep::create(PassRefPtr<Rep> base, int offset, int length) |
189 | { |
190 | assert(base); |
191 | |
192 | int baseOffset = base->offset; |
193 | |
194 | base = base->baseString; |
195 | |
196 | assert(-(offset + baseOffset) <= base->usedPreCapacity); |
197 | assert(offset + baseOffset + length <= base->usedCapacity); |
198 | |
199 | Rep* r = new Rep; |
200 | r->offset = baseOffset + offset; |
201 | r->len = length; |
202 | r->rc = 1; |
203 | r->_hash = 0; |
204 | r->isIdentifier = 0; |
205 | r->baseString = base.releaseRef(); |
206 | r->reportedCost = 0; |
207 | r->buf = 0; |
208 | r->usedCapacity = 0; |
209 | r->capacity = 0; |
210 | r->usedPreCapacity = 0; |
211 | r->preCapacity = 0; |
212 | |
213 | // steal the single reference this Rep was created with |
214 | return adoptRef(r); |
215 | } |
216 | |
217 | void UString::Rep::destroy() |
218 | { |
219 | if (isIdentifier) |
220 | Identifier::remove(this); |
221 | if (baseString != this) { |
222 | baseString->deref(); |
223 | } else { |
224 | fastFree(buf); |
225 | } |
226 | delete this; |
227 | } |
228 | |
// Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
// or anything like that.
// Used as the initial hash value by the computeHash() overloads in this file.
const unsigned PHI = 0x9e3779b9U;
232 | |
233 | // Paul Hsieh's SuperFastHash |
234 | // http://www.azillionmonkeys.com/qed/hash.html |
235 | unsigned UString::Rep::computeHash(const UChar *s, int len) |
236 | { |
237 | unsigned l = len; |
238 | uint32_t hash = PHI; |
239 | uint32_t tmp; |
240 | |
241 | int rem = l & 1; |
242 | l >>= 1; |
243 | |
244 | // Main loop |
245 | for (; l > 0; l--) { |
246 | hash += s[0].uc; |
247 | tmp = (s[1].uc << 11) ^ hash; |
248 | hash = (hash << 16) ^ tmp; |
249 | s += 2; |
250 | hash += hash >> 11; |
251 | } |
252 | |
253 | // Handle end case |
254 | if (rem) { |
255 | hash += s[0].uc; |
256 | hash ^= hash << 11; |
257 | hash += hash >> 17; |
258 | } |
259 | |
260 | // Force "avalanching" of final 127 bits |
261 | hash ^= hash << 3; |
262 | hash += hash >> 5; |
263 | hash ^= hash << 2; |
264 | hash += hash >> 15; |
265 | hash ^= hash << 10; |
266 | |
267 | // this avoids ever returning a hash code of 0, since that is used to |
268 | // signal "hash not computed yet", using a value that is likely to be |
269 | // effectively the same as 0 when the low bits are masked |
270 | if (hash == 0) |
271 | hash = 0x80000000; |
272 | |
273 | return hash; |
274 | } |
275 | |
276 | // Paul Hsieh's SuperFastHash |
277 | // http://www.azillionmonkeys.com/qed/hash.html |
278 | unsigned UString::Rep::computeHash(const char* s, int len) |
279 | { |
280 | // This hash is designed to work on 16-bit chunks at a time. But since the normal case |
281 | // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they |
282 | // were 16-bit chunks, which should give matching results |
283 | |
284 | uint32_t hash = PHI; |
285 | uint32_t tmp; |
286 | unsigned l = len; |
287 | |
288 | int rem = l & 1; |
289 | l >>= 1; |
290 | |
291 | // Main loop |
292 | for (; l > 0; l--) { |
293 | hash += (unsigned char)s[0]; |
294 | tmp = ((unsigned char)s[1] << 11) ^ hash; |
295 | hash = (hash << 16) ^ tmp; |
296 | s += 2; |
297 | hash += hash >> 11; |
298 | } |
299 | |
300 | // Handle end case |
301 | if (rem) { |
302 | hash += (unsigned char)s[0]; |
303 | hash ^= hash << 11; |
304 | hash += hash >> 17; |
305 | } |
306 | |
307 | // Force "avalanching" of final 127 bits |
308 | hash ^= hash << 3; |
309 | hash += hash >> 5; |
310 | hash ^= hash << 2; |
311 | hash += hash >> 15; |
312 | hash ^= hash << 10; |
313 | |
314 | // this avoids ever returning a hash code of 0, since that is used to |
315 | // signal "hash not computed yet", using a value that is likely to be |
316 | // effectively the same as 0 when the low bits are masked |
317 | if (hash == 0) |
318 | hash = 0x80000000; |
319 | |
320 | return hash; |
321 | } |
322 | |
323 | unsigned UString::Rep::computeHash(const char* s) |
324 | { |
325 | return computeHash(s, strlen(s)); |
326 | } |
327 | |
328 | // put these early so they can be inlined |
329 | inline size_t UString::expandedSize(size_t size, size_t otherSize) const |
330 | { |
331 | // Do the size calculation in two parts, returning overflowIndicator if |
332 | // we overflow the maximum value that we can handle. |
333 | |
334 | if (size > maxUChars()) |
335 | return overflowIndicator(); |
336 | |
337 | size_t expandedSize = ((size + 10) / 10 * 11) + 1; |
338 | if (maxUChars() - expandedSize < otherSize) |
339 | return overflowIndicator(); |
340 | |
341 | return expandedSize + otherSize; |
342 | } |
343 | |
344 | inline int UString::usedCapacity() const |
345 | { |
346 | return m_rep->baseString->usedCapacity; |
347 | } |
348 | |
349 | inline int UString::usedPreCapacity() const |
350 | { |
351 | return m_rep->baseString->usedPreCapacity; |
352 | } |
353 | |
354 | void UString::expandCapacity(int requiredLength) |
355 | { |
356 | Rep* r = m_rep->baseString; |
357 | |
358 | if (requiredLength > r->capacity) { |
359 | size_t newCapacity = expandedSize(requiredLength, r->preCapacity); |
360 | UChar* oldBuf = r->buf; |
361 | r->buf = reallocChars(r->buf, newCapacity); |
362 | if (!r->buf) { |
363 | r->buf = oldBuf; |
364 | m_rep = &Rep::null; |
365 | return; |
366 | } |
367 | r->capacity = newCapacity - r->preCapacity; |
368 | } |
369 | if (requiredLength > r->usedCapacity) { |
370 | r->usedCapacity = requiredLength; |
371 | } |
372 | } |
373 | |
374 | void UString::expandPreCapacity(int requiredPreCap) |
375 | { |
376 | Rep* r = m_rep->baseString; |
377 | |
378 | if (requiredPreCap > r->preCapacity) { |
379 | size_t newCapacity = expandedSize(requiredPreCap, r->capacity); |
380 | int delta = newCapacity - r->capacity - r->preCapacity; |
381 | |
382 | UChar* newBuf = allocChars(newCapacity); |
383 | if (!newBuf) { |
384 | m_rep = &Rep::null; |
385 | return; |
386 | } |
387 | memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar)); |
388 | fastFree(r->buf); |
389 | r->buf = newBuf; |
390 | |
391 | r->preCapacity = newCapacity - r->capacity; |
392 | } |
393 | if (requiredPreCap > r->usedPreCapacity) { |
394 | r->usedPreCapacity = requiredPreCap; |
395 | } |
396 | } |
397 | |
398 | |
399 | UString::UString(Empty) |
400 | : m_rep(&Rep::empty) |
401 | { |
402 | } |
403 | |
404 | UString::UString(char c) |
405 | : m_rep(Rep::create(allocChars(1), 1)) |
406 | { |
407 | m_rep->buf[0] = static_cast<unsigned char>(c); |
408 | } |
409 | |
410 | UString::UString(const char* c) |
411 | { |
412 | if (!c) { |
413 | m_rep = &Rep::null; |
414 | return; |
415 | } |
416 | |
417 | if (!c[0]) { |
418 | m_rep = &Rep::empty; |
419 | return; |
420 | } |
421 | |
422 | size_t length = strlen(c); |
423 | UChar *d = allocChars(length); |
424 | if (!d) |
425 | m_rep = &Rep::null; |
426 | else { |
427 | for (size_t i = 0; i < length; i++) |
428 | d[i].uc = c[i]; |
429 | m_rep = Rep::create(d, static_cast<int>(length)); |
430 | } |
431 | } |
432 | |
433 | UString::UString(const char* c, size_t length) |
434 | { |
435 | if (!c) { |
436 | m_rep = &Rep::null; |
437 | return; |
438 | } |
439 | |
440 | if (length == 0) { |
441 | m_rep = &Rep::empty; |
442 | return; |
443 | } |
444 | |
445 | UChar* d = allocChars(length); |
446 | if (!d) |
447 | m_rep = &Rep::null; |
448 | else { |
449 | for (size_t i = 0; i < length; i++) |
450 | d[i].uc = c[i]; |
451 | m_rep = Rep::create(d, static_cast<int>(length)); |
452 | } |
453 | } |
454 | |
455 | UString::UString(const UChar* c, int length) |
456 | { |
457 | if (length == 0) |
458 | m_rep = &Rep::empty; |
459 | else |
460 | m_rep = Rep::createCopying(c, length); |
461 | } |
462 | |
463 | UString::UString(UChar* c, int length, bool copy) |
464 | { |
465 | if (length == 0) |
466 | m_rep = &Rep::empty; |
467 | else if (copy) |
468 | m_rep = Rep::createCopying(c, length); |
469 | else |
470 | m_rep = Rep::create(c, length); |
471 | } |
472 | |
473 | UString::UString(const Vector<UChar>& buffer) |
474 | { |
475 | if (!buffer.size()) |
476 | m_rep = &Rep::empty; |
477 | else |
478 | m_rep = Rep::createCopying(buffer.data(), buffer.size()); |
479 | } |
480 | |
481 | |
// Concatenating constructor: the new string is the characters of a followed
// by the characters of b.
//
// Depending on where a and b sit inside their buffers, the result either
// reuses one operand's Rep unchanged (when the other operand is empty),
// writes the other operand's characters into the neighbouring part of an
// operand's buffer and creates a Rep sharing that buffer, or allocates a
// fresh buffer. When a needed buffer cannot be obtained, the result is the
// null string. minShareSize is defined outside this file section.
UString::UString(const UString &a, const UString &b)
{
  int aSize = a.size();
  int aOffset = a.m_rep->offset;
  int bSize = b.size();
  int bOffset = b.m_rep->offset;
  int length = aSize + bSize;

  // possible cases:

  if (aSize == 0) {
    // a is empty
    m_rep = b.m_rep;
  } else if (bSize == 0) {
    // b is empty
    m_rep = a.m_rep;
  } else if (aOffset + aSize == a.usedCapacity() && aSize >= minShareSize && 4 * aSize >= bSize &&
             (-bOffset != b.usedPreCapacity() || aSize >= bSize)) {
    // - a reaches the end of its buffer so it qualifies for shared append
    // - also, it's at least a quarter the length of b - appending to a much shorter
    //   string does more harm than good
    // - however, if b qualifies for prepend and is longer than a, we'd rather prepend
    // The buffer is grown through the non-const copy x, because a is const
    // (presumably x shares a's Rep, so a sees the grown buffer too - confirm
    // against the copy constructor in the header).
    UString x(a);
    x.expandCapacity(aOffset + length);
    // expandCapacity switches x to the null Rep on failure; x.data() is
    // presumably null in that case.
    if (a.data() && x.data()) {
      memcpy(const_cast<UChar *>(a.data() + aSize), b.data(), bSize * sizeof(UChar));
      m_rep = Rep::create(a.m_rep, 0, length);
    } else
      m_rep = &Rep::null;
  } else if (-bOffset == b.usedPreCapacity() && bSize >= minShareSize && 4 * bSize >= aSize) {
    // - b reaches the beginning of its buffer so it qualifies for shared prepend
    // - also, it's at least a quarter the length of a - prepending to a much shorter
    //   string does more harm than good
    // As above, the buffer is grown through a non-const copy of b.
    UString y(b);
    y.expandPreCapacity(-bOffset + aSize);
    if (b.data() && y.data()) {
      // a's characters go immediately in front of b's.
      memcpy(const_cast<UChar *>(b.data() - aSize), a.data(), aSize * sizeof(UChar));
      m_rep = Rep::create(b.m_rep, -aSize, length);
    } else
      m_rep = &Rep::null;
  } else {
    // a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
    size_t newCapacity = expandedSize(length, 0);
    UChar* d = allocChars(newCapacity);
    if (!d)
      m_rep = &Rep::null;
    else {
      memcpy(d, a.data(), aSize * sizeof(UChar));
      memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
      m_rep = Rep::create(d, length);
      // Rep::create set capacity to length; record the real, larger allocation.
      m_rep->capacity = newCapacity;
    }
  }
}
536 | |
537 | const UString &UString::null() |
538 | { |
539 | static UString* n = new UString; |
540 | return *n; |
541 | } |
542 | |
543 | UString UString::from(int i) |
544 | { |
545 | UChar buf[1 + sizeof(i) * 3]; |
546 | UChar *end = buf + sizeof(buf) / sizeof(UChar); |
547 | UChar *p = end; |
548 | |
549 | if (i == 0) { |
550 | *--p = '0'; |
551 | } else if (i == INT_MIN) { |
552 | char minBuf[1 + sizeof(i) * 3]; |
553 | sprintf(minBuf, "%d" , INT_MIN); |
554 | return UString(minBuf); |
555 | } else { |
556 | bool negative = false; |
557 | if (i < 0) { |
558 | negative = true; |
559 | i = -i; |
560 | } |
561 | while (i) { |
562 | *--p = (unsigned short)((i % 10) + '0'); |
563 | i /= 10; |
564 | } |
565 | if (negative) { |
566 | *--p = '-'; |
567 | } |
568 | } |
569 | |
570 | return UString(p, static_cast<int>(end - p)); |
571 | } |
572 | |
573 | UString UString::from(unsigned int u) |
574 | { |
575 | UChar buf[sizeof(u) * 3]; |
576 | UChar *end = buf + sizeof(buf) / sizeof(UChar); |
577 | UChar *p = end; |
578 | |
579 | if (u == 0) { |
580 | *--p = '0'; |
581 | } else { |
582 | while (u) { |
583 | *--p = (unsigned short)((u % 10) + '0'); |
584 | u /= 10; |
585 | } |
586 | } |
587 | |
588 | return UString(p, static_cast<int>(end - p)); |
589 | } |
590 | |
591 | UString UString::from(long l) |
592 | { |
593 | UChar buf[1 + sizeof(l) * 3]; |
594 | UChar *end = buf + sizeof(buf) / sizeof(UChar); |
595 | UChar *p = end; |
596 | |
597 | if (l == 0) { |
598 | *--p = '0'; |
599 | } else if (l == LONG_MIN) { |
600 | char minBuf[1 + sizeof(l) * 3]; |
601 | sprintf(minBuf, "%ld" , LONG_MIN); |
602 | return UString(minBuf); |
603 | } else { |
604 | bool negative = false; |
605 | if (l < 0) { |
606 | negative = true; |
607 | l = -l; |
608 | } |
609 | while (l) { |
610 | *--p = (unsigned short)((l % 10) + '0'); |
611 | l /= 10; |
612 | } |
613 | if (negative) { |
614 | *--p = '-'; |
615 | } |
616 | } |
617 | |
618 | return UString(p, static_cast<int>(end - p)); |
619 | } |
620 | |
// Returns a string representation of d.
//
// NaN always yields "NaN". Otherwise kjs_dtoa supplies a character string
// (`result`), a decimal-point position and a sign flag, and this function
// lays them out in one of four ways:
//   - -6 < decimalPoint <= 0 : "0." followed by -decimalPoint zeros and result
//   - 0 < decimalPoint <= 21 : result with the point inserted after
//                              decimalPoint characters, or padded with
//                              trailing zeros when result is shorter
//   - result does not start with a digit : result copied as is
//   - anything else          : exponential form "d.ddde+NN" / "d.ddde-NN"
// A leading '-' is emitted first whenever the sign flag is set.
UString UString::from(double d)
{
  // avoid ever printing -NaN, in JS conceptually there is only one NaN value
  if (isNaN(d))
    return UString("NaN" , 3);

  // Output is assembled here; i is the number of characters written so far.
  // The buffer is not NUL-terminated at the end; the length is passed
  // explicitly to the UString constructor.
  char buf[80];
  int decimalPoint;
  int sign;

  char *result = kjs_dtoa(d, 0, 0, &decimalPoint, &sign, NULL);
  int length = static_cast<int>(strlen(result));

  int i = 0;
  if (sign) {
    buf[i++] = '-';
  }

  if (decimalPoint <= 0 && decimalPoint > -6) {
    // Value below 1: "0." + leading zeros + digits.
    buf[i++] = '0';
    buf[i++] = '.';
    for (int j = decimalPoint; j < 0; j++) {
      buf[i++] = '0';
    }
    strcpy(buf + i, result);
    i += length;
  } else if (decimalPoint <= 21 && decimalPoint > 0) {
    if (length <= decimalPoint) {
      // All of result lies before the point; pad with zeros up to it.
      strcpy(buf + i, result);
      i += length;
      for (int j = 0; j < decimalPoint - length; j++) {
        buf[i++] = '0';
      }
      //      buf[i] = '\0';
    } else {
      // The point falls inside result: split it there.
      strncpy(buf + i, result, decimalPoint);
      i += decimalPoint;
      buf[i++] = '.';
      strcpy(buf + i, result + decimalPoint);
      i += length - decimalPoint;
    }
  } else if (result[0] < '0' || result[0] > '9') {
    // result is not a digit string (presumably a textual value such as
    // "Infinity" - confirm against kjs_dtoa); copy it unchanged.
    strcpy(buf + i, result);
    i += length;
  } else {
    // Exponential notation: first digit, optional fraction, then exponent.
    buf[i++] = result[0];
    if (length > 1) {
      buf[i++] = '.';
      strcpy(buf + i, result + 1);
      i += length - 1;
    }

    buf[i++] = 'e';
    buf[i++] = (decimalPoint >= 0) ? '+' : '-';
    // decimalPoint can't be more than 3 digits decimal given the
    // nature of float representation
    int exponential = decimalPoint - 1;
    if (exponential < 0) {
      exponential = exponential * -1;
    }
    // Emit the exponent's digits without leading zeros.
    if (exponential >= 100) {
      buf[i++] = '0' + exponential / 100;
    }
    if (exponential >= 10) {
      buf[i++] = '0' + (exponential % 100) / 10;
    }
    buf[i++] = '0' + exponential % 10;
    //    buf[i++] = '\0';
  }

  kjs_freedtoa(result);

  return UString(buf, i);
}
695 | |
696 | UString UString::spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const |
697 | { |
698 | if (rangeCount == 1 && separatorCount == 0) { |
699 | int thisSize = size(); |
700 | int position = substringRanges[0].position; |
701 | int length = substringRanges[0].length; |
702 | if (position <= 0 && length >= thisSize) |
703 | return *this; |
704 | return UString::Rep::create(m_rep, maxInt(0, position), minInt(thisSize, length)); |
705 | } |
706 | |
707 | int totalLength = 0; |
708 | for (int i = 0; i < rangeCount; i++) |
709 | totalLength += substringRanges[i].length; |
710 | for (int i = 0; i < separatorCount; i++) |
711 | totalLength += separators[i].size(); |
712 | |
713 | if (totalLength == 0) |
714 | return "" ; |
715 | |
716 | UChar* buffer = allocChars(totalLength); |
717 | if (!buffer) |
718 | return null(); |
719 | |
720 | int maxCount = max(rangeCount, separatorCount); |
721 | int bufferPos = 0; |
722 | for (int i = 0; i < maxCount; i++) { |
723 | if (i < rangeCount) { |
724 | memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar)); |
725 | bufferPos += substringRanges[i].length; |
726 | } |
727 | if (i < separatorCount) { |
728 | memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar)); |
729 | bufferPos += separators[i].size(); |
730 | } |
731 | } |
732 | |
733 | return UString::Rep::create(buffer, totalLength); |
734 | } |
735 | |
736 | // Append a sub-string of <subStr> to this string. |
737 | // Equivalent to append(subStr.substr(subPos, subLength)) |
738 | |
739 | UString& UString::append(const UString& subStr, int subPos, int subLength) |
740 | { |
741 | int subSize = subStr.size(); |
742 | |
743 | if (subPos < 0) |
744 | subPos = 0; |
745 | else if (subPos >= subSize) |
746 | subPos = subSize; |
747 | if (subLength < 0) |
748 | subLength = subSize; |
749 | if (subPos + subLength >= subSize) |
750 | subLength = subSize - subPos; |
751 | |
752 | return append(UString(subStr.data() + subPos, subLength)); |
753 | } |
754 | |
// Appends t to this string and returns *this.
//
// The strategy depends on how this string's Rep is used: the buffer is
// modified in place when this string is its own base and the Rep's reference
// count is 1, extended and shared when the string reaches the end of its
// buffer and is at least minShareSize long, and otherwise replaced by a
// freshly allocated buffer. If the fresh allocation fails, this string
// becomes the null string; expandCapacity likewise switches it to the null
// Rep when the buffer cannot be grown.
UString &UString::append(const UString &t)
{
  int thisSize = size();
  int thisOffset = m_rep->offset;
  int tSize = t.size();
  int length = thisSize + tSize;

  // possible cases:
  if (thisSize == 0) {
    // this is empty
    *this = t;
  } else if (tSize == 0) {
    // t is empty
  } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
    // this is direct and has refcount of 1 (so we can just alter it directly)
    expandCapacity(thisOffset + length);
    // data() is re-read after expandCapacity because the buffer may have moved
    // (or the string may have been switched to the null Rep).
    if (data()) {
      memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
      m_rep->len = length;
      // The contents changed, so any cached hash is stale.
      m_rep->_hash = 0;
    }
  } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
    // this reaches the end of the buffer - extend it if it's long enough to append to
    expandCapacity(thisOffset + length);
    if (data()) {
      memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
      // A new Rep covering the longer range replaces this string's Rep; the
      // old Rep itself is not modified here.
      m_rep = Rep::create(m_rep, 0, length);
    }
  } else {
    // this is shared with someone using more capacity, gotta make a whole new string
    size_t newCapacity = expandedSize(length, 0);
    UChar* d = allocChars(newCapacity);
    if (!d)
      m_rep = &Rep::null;
    else {
      memcpy(d, data(), thisSize * sizeof(UChar));
      memcpy(const_cast<UChar*>(d + thisSize), t.data(), tSize * sizeof(UChar));
      m_rep = Rep::create(d, length);
      // Rep::create set capacity to length; record the real, larger allocation.
      m_rep->capacity = newCapacity;
    }
  }

  return *this;
}
799 | |
800 | |
// Appends the NUL-terminated 8-bit string t to this string and returns *this.
// t is passed to strlen, so it must not be null. Each byte of t is assigned
// to a UChar element individually.
//
// The strategy mirrors append(const UString&): in-place modification when
// this string is its own base and the Rep's reference count is 1, extend and
// share when the string reaches the end of its buffer and is at least
// minShareSize long, otherwise a freshly allocated buffer (null string if
// that allocation fails).
UString &UString::append(const char *t)
{
  int thisSize = size();
  int thisOffset = m_rep->offset;
  int tSize = static_cast<int>(strlen(t));
  int length = thisSize + tSize;

  // possible cases:
  if (thisSize == 0) {
    // this is empty
    *this = t;
  } else if (tSize == 0) {
    // t is empty, we'll just return *this below.
  } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
    // this is direct and has refcount of 1 (so we can just alter it directly)
    expandCapacity(thisOffset + length);
    // data() is read after expandCapacity because the buffer may have moved
    // (or the string may have been switched to the null Rep).
    UChar *d = const_cast<UChar *>(data());
    if (d) {
      for (int i = 0; i < tSize; ++i)
        d[thisSize + i] = t[i];
      m_rep->len = length;
      // The contents changed, so any cached hash is stale.
      m_rep->_hash = 0;
    }
  } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
    // this string reaches the end of the buffer - extend it
    expandCapacity(thisOffset + length);
    UChar *d = const_cast<UChar *>(data());
    if (d) {
      for (int i = 0; i < tSize; ++i)
        d[thisSize + i] = t[i];
      // A new Rep covering the longer range replaces this string's Rep.
      m_rep = Rep::create(m_rep, 0, length);
    }
  } else {
    // this is shared with someone using more capacity, gotta make a whole new string
    size_t newCapacity = expandedSize(length, 0);
    UChar* d = allocChars(newCapacity);
    if (!d)
      m_rep = &Rep::null;
    else {
      memcpy(d, data(), thisSize * sizeof(UChar));
      for (int i = 0; i < tSize; ++i)
        d[thisSize + i] = t[i];
      m_rep = Rep::create(d, length);
      // Rep::create set capacity to length; record the real, larger allocation.
      m_rep->capacity = newCapacity;
    }
  }

  return *this;
}
850 | |
// Appends the single code unit c to this string and returns *this.
//
// An empty string always gets a freshly allocated buffer. Otherwise the
// strategy mirrors the other append() overloads: in-place modification when
// this string is its own base and the Rep's reference count is 1, extend and
// share when the string reaches the end of its buffer and is at least
// minShareSize long, and a new buffer in all remaining cases. If a fresh
// allocation fails, this string becomes the null string.
UString &UString::append(unsigned short c)
{
  int thisOffset = m_rep->offset;
  int length = size();

  // possible cases:
  if (length == 0) {
    // this is empty - must make a new m_rep because we don't want to pollute the shared empty one
    size_t newCapacity = expandedSize(1, 0);
    UChar* d = allocChars(newCapacity);
    if (!d)
      m_rep = &Rep::null;
    else {
      d[0] = c;
      m_rep = Rep::create(d, 1);
      // Rep::create set capacity to 1; record the real, larger allocation.
      m_rep->capacity = newCapacity;
    }
  } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
    // this is direct and has refcount of 1 (so we can just alter it directly)
    expandCapacity(thisOffset + length + 1);
    // data() is read after expandCapacity because the buffer may have moved
    // (or the string may have been switched to the null Rep).
    UChar *d = const_cast<UChar *>(data());
    if (d) {
      d[length] = c;
      m_rep->len = length + 1;
      // The contents changed, so any cached hash is stale.
      m_rep->_hash = 0;
    }
  } else if (thisOffset + length == usedCapacity() && length >= minShareSize) {
    // this reaches the end of the string - extend it and share
    expandCapacity(thisOffset + length + 1);
    UChar *d = const_cast<UChar *>(data());
    if (d) {
      d[length] = c;
      // A new Rep covering the longer range replaces this string's Rep.
      m_rep = Rep::create(m_rep, 0, length + 1);
    }
  } else {
    // this is shared with someone using more capacity, gotta make a whole new string
    size_t newCapacity = expandedSize(length + 1, 0);
    UChar* d = allocChars(newCapacity);
    if (!d)
      m_rep = &Rep::null;
    else {
      memcpy(d, data(), length * sizeof(UChar));
      d[length] = c;
      m_rep = Rep::create(d, length + 1);
      // Rep::create set capacity to length + 1; record the real allocation.
      m_rep->capacity = newCapacity;
    }
  }

  return *this;
}
901 | |
902 | CString UString::cstring() const |
903 | { |
904 | return ascii(); |
905 | } |
906 | |
907 | char *UString::ascii() const |
908 | { |
909 | // Never make the buffer smaller than normalStatBufferSize. |
910 | // Thus we almost never need to reallocate. |
911 | int length = size(); |
912 | int neededSize = length + 1; |
913 | if (neededSize < normalStatBufferSize) { |
914 | neededSize = normalStatBufferSize; |
915 | } |
916 | if (neededSize != statBufferSize) { |
917 | delete [] statBuffer; |
918 | statBuffer = new char [neededSize]; |
919 | statBufferSize = neededSize; |
920 | } |
921 | |
922 | const UChar *p = data(); |
923 | char *q = statBuffer; |
924 | const UChar *limit = p + length; |
925 | while (p != limit) { |
926 | *q = static_cast<char>(p->uc); |
927 | ++p; |
928 | ++q; |
929 | } |
930 | *q = '\0'; |
931 | |
932 | return statBuffer; |
933 | } |
934 | |
935 | UString& UString::operator=(Empty) |
936 | { |
937 | m_rep = &Rep::empty; |
938 | |
939 | return *this; |
940 | } |
941 | |
942 | UString& UString::operator=(const char* c) |
943 | { |
944 | set(c, c ? strlen(c) : 0); |
945 | |
946 | return *this; |
947 | } |
948 | |
949 | void UString::set(const char* c, int l) |
950 | { |
951 | if (!c) { |
952 | m_rep = &Rep::null; |
953 | return; |
954 | } |
955 | |
956 | if (l == 0) { |
957 | m_rep = &Rep::empty; |
958 | return; |
959 | } |
960 | |
961 | UChar *d; |
962 | if (m_rep->rc == 1 && l <= m_rep->capacity && m_rep->baseIsSelf() && m_rep->offset == 0 && m_rep->preCapacity == 0) { |
963 | d = m_rep->buf; |
964 | m_rep->_hash = 0; |
965 | m_rep->len = l; |
966 | } else { |
967 | d = allocChars(l); |
968 | if (!d) { |
969 | m_rep = &Rep::null; |
970 | return; |
971 | } |
972 | m_rep = Rep::create(d, l); |
973 | } |
974 | for (int i = 0; i < l; i++) |
975 | d[i].uc = static_cast<unsigned char>(c[i]); |
976 | } |
977 | |
978 | bool UString::is8Bit() const |
979 | { |
980 | const UChar *u = data(); |
981 | const UChar *limit = u + size(); |
982 | while (u < limit) { |
983 | if (u->uc > 0xFF) |
984 | return false; |
985 | ++u; |
986 | } |
987 | |
988 | return true; |
989 | } |
990 | |
991 | const UChar UString::operator[](int pos) const |
992 | { |
993 | if (pos >= size()) |
994 | return '\0'; |
995 | return data()[pos]; |
996 | } |
997 | |
// Parses this string as a number and returns it as a double.
//
// Leading white space (per CommonUnicode::isStrWhiteSpace) is skipped. The
// rest must consist of 8-bit characters, otherwise the result is NaN. It is
// then read as a hexadecimal literal ("0x"/"0X" prefix), as a decimal number
// through kjs_strtod, or as an optionally signed "Infinity". Trailing ASCII
// white space is allowed.
//
// tolerateTrailingJunk: when false, any other characters after the number
//                       make the result NaN.
// tolerateEmptyString:  when true, a string that is empty after skipping
//                       leading white space yields 0.0; otherwise NaN.
double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
{
  double d;

  const int length = size();
  int leadingSpaces = 0;

  // skip leading white space
  while (leadingSpaces < length && CommonUnicode::isStrWhiteSpace(data()[leadingSpaces].uc))
    ++leadingSpaces;

  UString whitespaceSkipped = substr(leadingSpaces, length - leadingSpaces);

  // FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
  // after the number, so is8Bit is too strict a check.
  if (!whitespaceSkipped.is8Bit())
    return NaN;

  // c points into the shared buffer that ascii() fills; it is only valid
  // until ascii() is called again.
  const char *c = whitespaceSkipped.ascii();

  // empty string ?
  if (*c == '\0')
    return tolerateEmptyString ? 0.0 : NaN;

  // hex number ?
  if (*c == '0' && (*(c+1) == 'x' || *(c+1) == 'X')) {
    const char* firstDigitPosition = c + 2;
    // Step onto the 'x'; the pre-increment in the loop then moves to the
    // first digit.
    c++;
    d = 0.0;
    while (*(++c)) {
      if (*c >= '0' && *c <= '9')
        d = d * 16.0 + *c - '0';
      else if ((*c >= 'A' && *c <= 'F') || (*c >= 'a' && *c <= 'f'))
        // Masking with 0xdf maps 'a'..'f' onto 'A'..'F'.
        d = d * 16.0 + (*c & 0xdf) - 'A' + 10.0;
      else
        break;
    }

    // At or above mantissaOverflowLowerBound (defined outside this file
    // section) the digits are re-parsed with parseIntOverflow.
    if (d >= mantissaOverflowLowerBound)
      d = parseIntOverflow(firstDigitPosition, c - firstDigitPosition, 16);
  } else {
    // regular number ?
    char *end;
    d = kjs_strtod(c, &end);
    // Accept kjs_strtod's result when it is finite and either non-zero or
    // some characters were consumed.
    if ((d != 0.0 || end != c) && d != Inf && d != -Inf) {
      c = end;
    } else {
      double sign = 1.0;

      if (*c == '+')
        c++;
      else if (*c == '-') {
        sign = -1.0;
        c++;
      }

      // We used strtod() to do the conversion. However, strtod() handles
      // infinite values slightly differently than JavaScript in that it
      // converts the string "inf" with any capitalization to infinity,
      // whereas the ECMA spec requires that it be converted to NaN.

      if (strncmp(c, "Infinity" , 8) == 0) {
        d = sign * Inf;
        c += 8;
      } else if ((d == Inf || d == -Inf) && *c != 'I' && *c != 'i')
        // An infinite result whose text does not start with 'I'/'i' is kept
        // as returned by kjs_strtod.
        c = end;
      else
        return NaN;
    }
  }

  // allow trailing white space
  while (isASCIISpace(*c))
    c++;
  // don't allow anything after - unless tolerant=true
  if (!tolerateTrailingJunk && *c != '\0')
    d = NaN;

  return d;
}
1078 | |
// Refuse to build with -ffast-math (GCC defines __FAST_MATH__ in that mode).
// NOTE(review): presumably because fast-math lets the compiler assume NaN and
// infinities never occur, which the NaN/Inf handling in this file (e.g. the
// comparisons against Inf in toDouble()) depends on - confirm.
#ifdef __FAST_MATH__
# error "KJS does not work correctly with -ffast-math"
#endif
1082 | |
1083 | double UString::toDouble(bool tolerateTrailingJunk) const |
1084 | { |
1085 | return toDouble(tolerateTrailingJunk, true); |
1086 | } |
1087 | |
1088 | double UString::toDouble() const |
1089 | { |
1090 | return toDouble(false, true); |
1091 | } |
1092 | |
1093 | uint32_t UString::toStrictUInt32(bool *ok) const |
1094 | { |
1095 | if (ok) |
1096 | *ok = false; |
1097 | |
1098 | // Empty string is not OK. |
1099 | int len = m_rep->len; |
1100 | if (len == 0) |
1101 | return 0; |
1102 | const UChar *p = m_rep->data(); |
1103 | unsigned short c = p->unicode(); |
1104 | |
1105 | // If the first digit is 0, only 0 itself is OK. |
1106 | if (c == '0') { |
1107 | if (len == 1 && ok) |
1108 | *ok = true; |
1109 | return 0; |
1110 | } |
1111 | |
1112 | // Convert to UInt32, checking for overflow. |
1113 | uint32_t i = 0; |
1114 | while (1) { |
1115 | // Process character, turning it into a digit. |
1116 | if (c < '0' || c > '9') |
1117 | return 0; |
1118 | const unsigned d = c - '0'; |
1119 | |
1120 | // Multiply by 10, checking for overflow out of 32 bits. |
1121 | if (i > 0xFFFFFFFFU / 10) |
1122 | return 0; |
1123 | i *= 10; |
1124 | |
1125 | // Add in the digit, checking for overflow out of 32 bits. |
1126 | const unsigned max = 0xFFFFFFFFU - d; |
1127 | if (i > max) |
1128 | return 0; |
1129 | i += d; |
1130 | |
1131 | // Handle end of string. |
1132 | if (--len == 0) { |
1133 | if (ok) |
1134 | *ok = true; |
1135 | return i; |
1136 | } |
1137 | |
1138 | // Get next character. |
1139 | c = (++p)->unicode(); |
1140 | } |
1141 | } |
1142 | |
1143 | int UString::find(const UString &f, int pos) const |
1144 | { |
1145 | int sz = size(); |
1146 | int fsz = f.size(); |
1147 | if (sz < fsz) |
1148 | return -1; |
1149 | if (pos < 0) |
1150 | pos = 0; |
1151 | if (fsz == 0) |
1152 | return pos; |
1153 | const UChar* data_ = data(); |
1154 | const UChar* end = data_ + sz - fsz; |
1155 | int fsizeminusone = (fsz - 1) * sizeof(UChar); |
1156 | const UChar *fdata = f.data(); |
1157 | unsigned short fchar = fdata->uc; |
1158 | ++fdata; |
1159 | for (const UChar* c = data_ + pos; c <= end; c++) |
1160 | if (c->uc == fchar && !memcmp(c + 1, fdata, fsizeminusone)) |
1161 | return (c - data_); |
1162 | |
1163 | return -1; |
1164 | } |
1165 | |
1166 | int UString::find(UChar ch, int pos) const |
1167 | { |
1168 | if (pos < 0) |
1169 | pos = 0; |
1170 | const UChar* data_ = data(); |
1171 | const UChar *end = data_ + size(); |
1172 | for (const UChar *c = data_ + pos; c < end; c++) |
1173 | if (*c == ch) |
1174 | return (c - data_); |
1175 | |
1176 | return -1; |
1177 | } |
1178 | |
1179 | int UString::rfind(const UString &f, int pos) const |
1180 | { |
1181 | int sz = size(); |
1182 | int fsz = f.size(); |
1183 | if (sz < fsz) |
1184 | return -1; |
1185 | if (pos < 0) |
1186 | pos = 0; |
1187 | if (pos > sz - fsz) |
1188 | pos = sz - fsz; |
1189 | if (fsz == 0) |
1190 | return pos; |
1191 | int fsizeminusone = (fsz - 1) * sizeof(UChar); |
1192 | const UChar *fdata = f.data(); |
1193 | const UChar* data_ = data(); |
1194 | for (const UChar* c = data_ + pos; c >= data_; c--) { |
1195 | if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone)) |
1196 | return (c - data_); |
1197 | } |
1198 | |
1199 | return -1; |
1200 | } |
1201 | |
1202 | int UString::rfind(UChar ch, int pos) const |
1203 | { |
1204 | if (isEmpty()) |
1205 | return -1; |
1206 | if (pos + 1 >= size()) |
1207 | pos = size() - 1; |
1208 | const UChar* data_ = data(); |
1209 | for (const UChar* c = data_ + pos; c >= data_; c--) { |
1210 | if (*c == ch) |
1211 | return (c - data_); |
1212 | } |
1213 | |
1214 | return -1; |
1215 | } |
1216 | |
1217 | UString UString::substr(int pos, int len) const |
1218 | { |
1219 | int s = size(); |
1220 | |
1221 | if (pos < 0) |
1222 | pos = 0; |
1223 | else if (pos >= s) |
1224 | pos = s; |
1225 | if (len < 0) |
1226 | len = s; |
1227 | if (pos + len >= s) |
1228 | len = s - pos; |
1229 | |
1230 | if (pos == 0 && len == s) |
1231 | return *this; |
1232 | |
1233 | return UString(Rep::create(m_rep, pos, len)); |
1234 | } |
1235 | |
1236 | void UString::copyForWriting() |
1237 | { |
1238 | int l = size(); |
1239 | if (!l) return; // Not going to touch anything anyway. |
1240 | if (m_rep->rc > 1 || !m_rep->baseIsSelf()) { |
1241 | UChar* n = allocChars(l); |
1242 | memcpy(n, data(), l * sizeof(UChar)); |
1243 | m_rep = Rep::create(n, l); |
1244 | } |
1245 | } |
1246 | |
1247 | bool operator==(const UString& s1, const UString& s2) |
1248 | { |
1249 | #if 0 |
1250 | if (s1.m_rep == s2.m_rep) |
1251 | return true; |
1252 | #endif |
1253 | |
1254 | if (s1.m_rep->len != s2.m_rep->len) |
1255 | return false; |
1256 | |
1257 | return (memcmp(s1.m_rep->data(), s2.m_rep->data(), |
1258 | s1.m_rep->len * sizeof(UChar)) == 0); |
1259 | } |
1260 | |
1261 | bool operator==(const UString& s1, const char *s2) |
1262 | { |
1263 | if (s2 == 0) { |
1264 | return s1.isEmpty(); |
1265 | } |
1266 | |
1267 | const UChar *u = s1.data(); |
1268 | const UChar *uend = u + s1.size(); |
1269 | while (u != uend && *s2) { |
1270 | if (u->uc != (unsigned char)*s2) |
1271 | return false; |
1272 | s2++; |
1273 | u++; |
1274 | } |
1275 | |
1276 | return u == uend && *s2 == 0; |
1277 | } |
1278 | |
1279 | bool operator<(const UString& s1, const UString& s2) |
1280 | { |
1281 | const int l1 = s1.size(); |
1282 | const int l2 = s2.size(); |
1283 | const int lmin = l1 < l2 ? l1 : l2; |
1284 | const UChar *c1 = s1.data(); |
1285 | const UChar *c2 = s2.data(); |
1286 | int l = 0; |
1287 | while (l < lmin && *c1 == *c2) { |
1288 | c1++; |
1289 | c2++; |
1290 | l++; |
1291 | } |
1292 | if (l < lmin) |
1293 | return (c1->uc < c2->uc); |
1294 | |
1295 | return (l1 < l2); |
1296 | } |
1297 | |
1298 | bool UString::equal(const UString::Rep *r, const UString::Rep *b) |
1299 | { |
1300 | if (r == b) |
1301 | return true; |
1302 | |
1303 | int length = r->len; |
1304 | if (length != b->len) |
1305 | return false; |
1306 | |
1307 | const UChar *d = r->data(); |
1308 | const UChar *s = b->data(); |
1309 | for (int i = 0; i != length; ++i) |
1310 | if (d[i].uc != s[i].uc) |
1311 | return false; |
1312 | return true; |
1313 | } |
1314 | |
1315 | |
1316 | int compare(const UString& s1, const UString& s2) |
1317 | { |
1318 | const int l1 = s1.size(); |
1319 | const int l2 = s2.size(); |
1320 | const int lmin = l1 < l2 ? l1 : l2; |
1321 | const UChar *c1 = s1.data(); |
1322 | const UChar *c2 = s2.data(); |
1323 | int l = 0; |
1324 | while (l < lmin && *c1 == *c2) { |
1325 | c1++; |
1326 | c2++; |
1327 | l++; |
1328 | } |
1329 | |
1330 | if (l < lmin) |
1331 | return (c1->uc > c2->uc) ? 1 : -1; |
1332 | |
1333 | if (l1 == l2) |
1334 | return 0; |
1335 | |
1336 | return (l1 > l2) ? 1 : -1; |
1337 | } |
1338 | |
// Given the first byte of a UTF-8 sequence, returns the length (2, 3 or 4)
// of the multi-byte sequence it starts. Returns 0 for every byte that does
// not start a multi-byte sequence (ASCII bytes, continuation bytes, and lead
// bytes of sequences longer than 4 bytes).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
  const unsigned lead = static_cast<unsigned char>(b0);

  if (lead < 0xC0) // 0xxxxxxx or 10xxxxxx
    return 0;
  if (lead < 0xE0) // 110xxxxx
    return 2;
  if (lead < 0xF0) // 1110xxxx
    return 3;
  if (lead < 0xF8) // 11110xxx
    return 4;
  return 0;
}
1351 | |
1352 | int UTF8SequenceLengthNonASCII(char b0) |
1353 | { |
1354 | return inlineUTF8SequenceLengthNonASCII(b0); |
1355 | } |
1356 | |
1357 | inline int inlineUTF8SequenceLength(char b0) |
1358 | { |
1359 | return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0); |
1360 | } |
1361 | |
1362 | // Given a first byte, gives the length of the UTF-8 sequence it begins. |
1363 | // Returns 0 for bytes that are not legal starts of UTF-8 sequences. |
1364 | // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). |
1365 | int UTF8SequenceLength(char b0) |
1366 | { |
1367 | return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |
1368 | } |
1369 | |
1370 | // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. |
1371 | // Only allows Unicode characters (U-00000000 to U-0010FFFF). |
1372 | // Returns -1 if the sequence is not valid (including presence of extra bytes). |
1373 | int decodeUTF8Sequence(const char *sequence) |
1374 | { |
1375 | // Handle 0-byte sequences (never valid). |
1376 | const unsigned char b0 = sequence[0]; |
1377 | const int length = inlineUTF8SequenceLength(b0); |
1378 | if (length == 0) |
1379 | return -1; |
1380 | |
1381 | // Handle 1-byte sequences (plain ASCII). |
1382 | const unsigned char b1 = sequence[1]; |
1383 | if (length == 1) { |
1384 | if (b1) |
1385 | return -1; |
1386 | return b0; |
1387 | } |
1388 | |
1389 | // Handle 2-byte sequences. |
1390 | if ((b1 & 0xC0) != 0x80) |
1391 | return -1; |
1392 | const unsigned char b2 = sequence[2]; |
1393 | if (length == 2) { |
1394 | if (b2) |
1395 | return -1; |
1396 | const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F); |
1397 | if (c < 0x80) |
1398 | return -1; |
1399 | return c; |
1400 | } |
1401 | |
1402 | // Handle 3-byte sequences. |
1403 | if ((b2 & 0xC0) != 0x80) |
1404 | return -1; |
1405 | const unsigned char b3 = sequence[3]; |
1406 | if (length == 3) { |
1407 | if (b3) |
1408 | return -1; |
1409 | const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); |
1410 | if (c < 0x800) |
1411 | return -1; |
1412 | // UTF-16 surrogates should never appear in UTF-8 data. |
1413 | if (c >= 0xD800 && c <= 0xDFFF) |
1414 | return -1; |
1415 | // Backwards BOM and U+FFFF should never appear in UTF-8 data. |
1416 | if (c == 0xFFFE || c == 0xFFFF) |
1417 | return -1; |
1418 | return c; |
1419 | } |
1420 | |
1421 | // Handle 4-byte sequences. |
1422 | if ((b3 & 0xC0) != 0x80) |
1423 | return -1; |
1424 | const unsigned char b4 = sequence[4]; |
1425 | if (length == 4) { |
1426 | if (b4) |
1427 | return -1; |
1428 | const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); |
1429 | if (c < 0x10000 || c > 0x10FFFF) |
1430 | return -1; |
1431 | return c; |
1432 | } |
1433 | |
1434 | return -1; |
1435 | } |
1436 | |
1437 | CString UString::UTF8String() const |
1438 | { |
1439 | // Allocate a buffer big enough to hold all the characters. |
1440 | const int length = size(); |
1441 | Vector<char, 1024> buffer(length * 3); |
1442 | |
1443 | // Convert to runs of 8-bit characters. |
1444 | char *p = buffer.begin(); |
1445 | const unsigned short* d = &data()->uc; |
1446 | for (int i = 0; i != length; ++i) { |
1447 | unsigned int c = d[i], sc; |
1448 | if (c < 0x80) { |
1449 | *p++ = (char)c; |
1450 | } else if (c < 0x800) { |
1451 | *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 |
1452 | *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set |
1453 | } else if (c >= 0xD800 && c <= 0xDBFF && (i+1) < length && |
1454 | (sc = d[i+1]) >= 0xDC00 && sc <= 0xDFFF) { |
1455 | sc = 0x10000 + (((c & 0x3FF) << 10) | (sc & 0x3FF)); |
1456 | *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8 |
1457 | *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set |
1458 | *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set |
1459 | *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set |
1460 | ++i; |
1461 | } else { |
1462 | *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 |
1463 | *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set |
1464 | *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set |
1465 | } |
1466 | } |
1467 | |
1468 | // Return the result as a C string. |
1469 | CString result(buffer.data(), p - buffer.data()); |
1470 | |
1471 | return result; |
1472 | } |
1473 | |
1474 | } // namespace KJS |
1475 | |