1// -*- c-basic-offset: 2 -*-
2/*
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5 * Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
6 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25#include "ustring.h"
26#include <config-kjs.h>
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include "wtf/DisallowCType.h"
32#include "wtf/ASCIICType.h"
33#if HAVE(STRING_H)
34#include <string.h>
35#endif
36#if HAVE(STRINGS_H)
37#include <strings.h>
38#endif
39#include <limits.h>
40
41#include "operations.h"
42#include "function.h"
43#include "identifier.h"
44#include <math.h>
45#include "dtoa.h"
46#include "collector.h"
47#include "commonunicode.h"
48
49#include <wtf/Vector.h>
50
51using std::max;
52
53// GCC cstring uses these automatically, but not all implementations do.
54using std::strlen;
55using std::strcpy;
56using std::strncpy;
57using std::memset;
58using std::memcpy;
59
60using namespace WTF;
61
62namespace KJS {
63
64extern const double NaN;
65extern const double Inf;
66
// Sentinel size returned by capacity computations that would overflow.
static inline size_t overflowIndicator()
{
  return std::numeric_limits<size_t>::max();
}
68static inline size_t maxUChars() { return std::numeric_limits<size_t>::max() / sizeof(UChar); }
69
70static inline UChar* allocChars(size_t length)
71{
72 assert(length);
73 if (length > maxUChars())
74 return 0;
75 return static_cast<UChar*>(fastMalloc(sizeof(UChar) * length));
76}
77
78static inline UChar* reallocChars(UChar* buffer, size_t length)
79{
80 ASSERT(length);
81 if (length > maxUChars())
82 return 0;
83 return static_cast<UChar*>(fastRealloc(buffer, sizeof(UChar) * length));
84}
85
86CString::CString(const char *c)
87{
88 length = strlen(c);
89 data = new char[length+1];
90 memcpy(data, c, length + 1);
91}
92
93CString::CString(const char *c, size_t len)
94{
95 length = len;
96 data = new char[len+1];
97 memcpy(data, c, len);
98 data[len] = 0;
99}
100
101CString::CString(const CString &b)
102{
103 length = b.length;
104 if (length > 0 && b.data) {
105 data = new char[length+1];
106 memcpy(data, b.data, length + 1);
107 }
108 else
109 data = 0;
110}
111
112CString::~CString()
113{
114 delete [] data;
115}
116
117CString &CString::operator=(const char *c)
118{
119 if (data)
120 delete [] data;
121 length = strlen(c);
122 data = new char[length+1];
123 memcpy(data, c, length + 1);
124
125 return *this;
126}
127
128CString &CString::operator=(const CString &str)
129{
130 if (this == &str)
131 return *this;
132
133 if (data)
134 delete [] data;
135 length = str.length;
136 if (str.data) {
137 data = new char[length + 1];
138 memcpy(data, str.data, length + 1);
139 }
140 else
141 data = 0;
142
143 return *this;
144}
145
146bool operator==(const CString& c1, const CString& c2)
147{
148 size_t len = c1.size();
149 return len == c2.size() && (len == 0 || memcmp(c1.c_str(), c2.c_str(), len) == 0);
150}
151
// Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
// This zero-initialized storage is what Rep::empty's buffer pointer refers to.
static unsigned short almostUChar;
// Statically initialized shared representations used for the null and the
// empty string. Each one names itself as its base string.
// NOTE(review): the positional initializers presumably follow the field order
// that Rep::create() assigns (offset, len, rc, _hash, isIdentifier,
// baseString, reportedCost, buf, usedCapacity, capacity, usedPreCapacity,
// preCapacity) -- confirm against the Rep declaration in ustring.h.
UString::Rep UString::Rep::null = { 0, 0, 1, 0, 0, &UString::Rep::null, 0, 0, 0, 0, 0, 0 };
UString::Rep UString::Rep::empty = { 0, 0, 1, 0, 0, &UString::Rep::empty, 0, reinterpret_cast<UChar*>(&almostUChar), 0, 0, 0, 0 };
// Minimum size of the shared conversion buffer handed out by UString::ascii().
const int normalStatBufferSize = 4096;
// Shared buffer returned by UString::ascii(); each call overwrites (and may
// reallocate) it, so a previously returned pointer does not stay valid.
static char *statBuffer = 0; // FIXME: This buffer is never deallocated.
// Current allocation size of statBuffer in bytes (0 until first use).
static int statBufferSize = 0;
159
160PassRefPtr<UString::Rep> UString::Rep::createCopying (const UChar* d, int length)
161{
162 UChar* copyD = allocChars(length);
163 memcpy(copyD, d, length * sizeof(UChar));
164
165 return create(copyD, length);
166}
167
168PassRefPtr<UString::Rep> UString::Rep::create(UChar *d, int l)
169{
170 Rep* r = new Rep;
171 r->offset = 0;
172 r->len = l;
173 r->rc = 1;
174 r->_hash = 0;
175 r->isIdentifier = 0;
176 r->baseString = r;
177 r->reportedCost = 0;
178 r->buf = d;
179 r->usedCapacity = l;
180 r->capacity = l;
181 r->usedPreCapacity = 0;
182 r->preCapacity = 0;
183
184 // steal the single reference this Rep was created with
185 return adoptRef(r);
186}
187
188PassRefPtr<UString::Rep> UString::Rep::create(PassRefPtr<Rep> base, int offset, int length)
189{
190 assert(base);
191
192 int baseOffset = base->offset;
193
194 base = base->baseString;
195
196 assert(-(offset + baseOffset) <= base->usedPreCapacity);
197 assert(offset + baseOffset + length <= base->usedCapacity);
198
199 Rep* r = new Rep;
200 r->offset = baseOffset + offset;
201 r->len = length;
202 r->rc = 1;
203 r->_hash = 0;
204 r->isIdentifier = 0;
205 r->baseString = base.releaseRef();
206 r->reportedCost = 0;
207 r->buf = 0;
208 r->usedCapacity = 0;
209 r->capacity = 0;
210 r->usedPreCapacity = 0;
211 r->preCapacity = 0;
212
213 // steal the single reference this Rep was created with
214 return adoptRef(r);
215}
216
217void UString::Rep::destroy()
218{
219 if (isIdentifier)
220 Identifier::remove(this);
221 if (baseString != this) {
222 baseString->deref();
223 } else {
224 fastFree(buf);
225 }
226 delete this;
227}
228
// Golden ratio - arbitrary start value to avoid mapping all 0's to all 0's
// or anything like that. Used as the initial hash state by both
// UString::Rep::computeHash overloads.
const unsigned PHI = 0x9e3779b9U;
232
233// Paul Hsieh's SuperFastHash
234// http://www.azillionmonkeys.com/qed/hash.html
235unsigned UString::Rep::computeHash(const UChar *s, int len)
236{
237 unsigned l = len;
238 uint32_t hash = PHI;
239 uint32_t tmp;
240
241 int rem = l & 1;
242 l >>= 1;
243
244 // Main loop
245 for (; l > 0; l--) {
246 hash += s[0].uc;
247 tmp = (s[1].uc << 11) ^ hash;
248 hash = (hash << 16) ^ tmp;
249 s += 2;
250 hash += hash >> 11;
251 }
252
253 // Handle end case
254 if (rem) {
255 hash += s[0].uc;
256 hash ^= hash << 11;
257 hash += hash >> 17;
258 }
259
260 // Force "avalanching" of final 127 bits
261 hash ^= hash << 3;
262 hash += hash >> 5;
263 hash ^= hash << 2;
264 hash += hash >> 15;
265 hash ^= hash << 10;
266
267 // this avoids ever returning a hash code of 0, since that is used to
268 // signal "hash not computed yet", using a value that is likely to be
269 // effectively the same as 0 when the low bits are masked
270 if (hash == 0)
271 hash = 0x80000000;
272
273 return hash;
274}
275
276// Paul Hsieh's SuperFastHash
277// http://www.azillionmonkeys.com/qed/hash.html
278unsigned UString::Rep::computeHash(const char* s, int len)
279{
280 // This hash is designed to work on 16-bit chunks at a time. But since the normal case
281 // (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
282 // were 16-bit chunks, which should give matching results
283
284 uint32_t hash = PHI;
285 uint32_t tmp;
286 unsigned l = len;
287
288 int rem = l & 1;
289 l >>= 1;
290
291 // Main loop
292 for (; l > 0; l--) {
293 hash += (unsigned char)s[0];
294 tmp = ((unsigned char)s[1] << 11) ^ hash;
295 hash = (hash << 16) ^ tmp;
296 s += 2;
297 hash += hash >> 11;
298 }
299
300 // Handle end case
301 if (rem) {
302 hash += (unsigned char)s[0];
303 hash ^= hash << 11;
304 hash += hash >> 17;
305 }
306
307 // Force "avalanching" of final 127 bits
308 hash ^= hash << 3;
309 hash += hash >> 5;
310 hash ^= hash << 2;
311 hash += hash >> 15;
312 hash ^= hash << 10;
313
314 // this avoids ever returning a hash code of 0, since that is used to
315 // signal "hash not computed yet", using a value that is likely to be
316 // effectively the same as 0 when the low bits are masked
317 if (hash == 0)
318 hash = 0x80000000;
319
320 return hash;
321}
322
323unsigned UString::Rep::computeHash(const char* s)
324{
325 return computeHash(s, strlen(s));
326}
327
328// put these early so they can be inlined
329inline size_t UString::expandedSize(size_t size, size_t otherSize) const
330{
331 // Do the size calculation in two parts, returning overflowIndicator if
332 // we overflow the maximum value that we can handle.
333
334 if (size > maxUChars())
335 return overflowIndicator();
336
337 size_t expandedSize = ((size + 10) / 10 * 11) + 1;
338 if (maxUChars() - expandedSize < otherSize)
339 return overflowIndicator();
340
341 return expandedSize + otherSize;
342}
343
344inline int UString::usedCapacity() const
345{
346 return m_rep->baseString->usedCapacity;
347}
348
349inline int UString::usedPreCapacity() const
350{
351 return m_rep->baseString->usedPreCapacity;
352}
353
354void UString::expandCapacity(int requiredLength)
355{
356 Rep* r = m_rep->baseString;
357
358 if (requiredLength > r->capacity) {
359 size_t newCapacity = expandedSize(requiredLength, r->preCapacity);
360 UChar* oldBuf = r->buf;
361 r->buf = reallocChars(r->buf, newCapacity);
362 if (!r->buf) {
363 r->buf = oldBuf;
364 m_rep = &Rep::null;
365 return;
366 }
367 r->capacity = newCapacity - r->preCapacity;
368 }
369 if (requiredLength > r->usedCapacity) {
370 r->usedCapacity = requiredLength;
371 }
372}
373
374void UString::expandPreCapacity(int requiredPreCap)
375{
376 Rep* r = m_rep->baseString;
377
378 if (requiredPreCap > r->preCapacity) {
379 size_t newCapacity = expandedSize(requiredPreCap, r->capacity);
380 int delta = newCapacity - r->capacity - r->preCapacity;
381
382 UChar* newBuf = allocChars(newCapacity);
383 if (!newBuf) {
384 m_rep = &Rep::null;
385 return;
386 }
387 memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar));
388 fastFree(r->buf);
389 r->buf = newBuf;
390
391 r->preCapacity = newCapacity - r->capacity;
392 }
393 if (requiredPreCap > r->usedPreCapacity) {
394 r->usedPreCapacity = requiredPreCap;
395 }
396}
397
398
399UString::UString(Empty)
400 : m_rep(&Rep::empty)
401{
402}
403
404UString::UString(char c)
405 : m_rep(Rep::create(allocChars(1), 1))
406{
407 m_rep->buf[0] = static_cast<unsigned char>(c);
408}
409
410UString::UString(const char* c)
411{
412 if (!c) {
413 m_rep = &Rep::null;
414 return;
415 }
416
417 if (!c[0]) {
418 m_rep = &Rep::empty;
419 return;
420 }
421
422 size_t length = strlen(c);
423 UChar *d = allocChars(length);
424 if (!d)
425 m_rep = &Rep::null;
426 else {
427 for (size_t i = 0; i < length; i++)
428 d[i].uc = c[i];
429 m_rep = Rep::create(d, static_cast<int>(length));
430 }
431}
432
433UString::UString(const char* c, size_t length)
434{
435 if (!c) {
436 m_rep = &Rep::null;
437 return;
438 }
439
440 if (length == 0) {
441 m_rep = &Rep::empty;
442 return;
443 }
444
445 UChar* d = allocChars(length);
446 if (!d)
447 m_rep = &Rep::null;
448 else {
449 for (size_t i = 0; i < length; i++)
450 d[i].uc = c[i];
451 m_rep = Rep::create(d, static_cast<int>(length));
452 }
453}
454
455UString::UString(const UChar* c, int length)
456{
457 if (length == 0)
458 m_rep = &Rep::empty;
459 else
460 m_rep = Rep::createCopying(c, length);
461}
462
463UString::UString(UChar* c, int length, bool copy)
464{
465 if (length == 0)
466 m_rep = &Rep::empty;
467 else if (copy)
468 m_rep = Rep::createCopying(c, length);
469 else
470 m_rep = Rep::create(c, length);
471}
472
473UString::UString(const Vector<UChar>& buffer)
474{
475 if (!buffer.size())
476 m_rep = &Rep::empty;
477 else
478 m_rep = Rep::createCopying(buffer.data(), buffer.size());
479}
480
481
// Concatenating constructor: builds the string a + b.
//
// Where possible the result shares a buffer with one operand instead of
// copying both: b's characters are written into spare room after a
// ("shared append"), or a's characters into spare room before b ("shared
// prepend"). Otherwise a new buffer with extra capacity is allocated. If a
// required allocation fails, the result is the null string.
UString::UString(const UString &a, const UString &b)
{
  int aSize = a.size();
  int aOffset = a.m_rep->offset;
  int bSize = b.size();
  int bOffset = b.m_rep->offset;
  int length = aSize + bSize;

  // possible cases:

  if (aSize == 0) {
    // a is empty
    m_rep = b.m_rep;
  } else if (bSize == 0) {
    // b is empty
    m_rep = a.m_rep;
  } else if (aOffset + aSize == a.usedCapacity() && aSize >= minShareSize && 4 * aSize >= bSize &&
             (-bOffset != b.usedPreCapacity() || aSize >= bSize)) {
    // - a reaches the end of its buffer so it qualifies for shared append
    // - also, it's at least a quarter the length of b - appending to a much shorter
    //   string does more harm than good
    // - however, if b qualifies for prepend and is longer than a, we'd rather prepend
    // x is a temporary handle on a's rep; expandCapacity() resets x to the
    // null string if the buffer cannot be grown, which x.data() detects below.
    UString x(a);
    x.expandCapacity(aOffset + length);
    if (a.data() && x.data()) {
      // Write b's characters directly behind a's in the shared buffer.
      memcpy(const_cast<UChar *>(a.data() + aSize), b.data(), bSize * sizeof(UChar));
      m_rep = Rep::create(a.m_rep, 0, length);
    } else
      m_rep = &Rep::null;
  } else if (-bOffset == b.usedPreCapacity() && bSize >= minShareSize && 4 * bSize >= aSize) {
    // - b reaches the beginning of its buffer so it qualifies for shared prepend
    // - also, it's at least a quarter the length of a - prepending to a much shorter
    //   string does more harm than good
    // y is a temporary handle on b's rep; expandPreCapacity() resets y to the
    // null string if the allocation fails, which y.data() detects below.
    UString y(b);
    y.expandPreCapacity(-bOffset + aSize);
    if (b.data() && y.data()) {
      // Write a's characters directly in front of b's in the shared buffer.
      memcpy(const_cast<UChar *>(b.data() - aSize), a.data(), aSize * sizeof(UChar));
      m_rep = Rep::create(b.m_rep, -aSize, length);
    } else
      m_rep = &Rep::null;
  } else {
    // a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
    size_t newCapacity = expandedSize(length, 0);
    UChar* d = allocChars(newCapacity);
    if (!d)
      m_rep = &Rep::null;
    else {
      memcpy(d, a.data(), aSize * sizeof(UChar));
      memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
      m_rep = Rep::create(d, length);
      // Rep::create() set capacity to length; record the real allocation size.
      m_rep->capacity = newCapacity;
    }
  }
}
536
537const UString &UString::null()
538{
539 static UString* n = new UString;
540 return *n;
541}
542
543UString UString::from(int i)
544{
545 UChar buf[1 + sizeof(i) * 3];
546 UChar *end = buf + sizeof(buf) / sizeof(UChar);
547 UChar *p = end;
548
549 if (i == 0) {
550 *--p = '0';
551 } else if (i == INT_MIN) {
552 char minBuf[1 + sizeof(i) * 3];
553 sprintf(minBuf, "%d", INT_MIN);
554 return UString(minBuf);
555 } else {
556 bool negative = false;
557 if (i < 0) {
558 negative = true;
559 i = -i;
560 }
561 while (i) {
562 *--p = (unsigned short)((i % 10) + '0');
563 i /= 10;
564 }
565 if (negative) {
566 *--p = '-';
567 }
568 }
569
570 return UString(p, static_cast<int>(end - p));
571}
572
573UString UString::from(unsigned int u)
574{
575 UChar buf[sizeof(u) * 3];
576 UChar *end = buf + sizeof(buf) / sizeof(UChar);
577 UChar *p = end;
578
579 if (u == 0) {
580 *--p = '0';
581 } else {
582 while (u) {
583 *--p = (unsigned short)((u % 10) + '0');
584 u /= 10;
585 }
586 }
587
588 return UString(p, static_cast<int>(end - p));
589}
590
591UString UString::from(long l)
592{
593 UChar buf[1 + sizeof(l) * 3];
594 UChar *end = buf + sizeof(buf) / sizeof(UChar);
595 UChar *p = end;
596
597 if (l == 0) {
598 *--p = '0';
599 } else if (l == LONG_MIN) {
600 char minBuf[1 + sizeof(l) * 3];
601 sprintf(minBuf, "%ld", LONG_MIN);
602 return UString(minBuf);
603 } else {
604 bool negative = false;
605 if (l < 0) {
606 negative = true;
607 l = -l;
608 }
609 while (l) {
610 *--p = (unsigned short)((l % 10) + '0');
611 l /= 10;
612 }
613 if (negative) {
614 *--p = '-';
615 }
616 }
617
618 return UString(p, static_cast<int>(end - p));
619}
620
// Converts a double to its string form.
//
// The digits come from kjs_dtoa(); this function only lays them out:
//   - plain "0.000ddd" form when -6 < decimalPoint <= 0,
//   - plain "ddd", "ddd000" or "dd.ddd" form when 0 < decimalPoint <= 21,
//   - the dtoa text verbatim when it does not start with a digit,
//   - exponential "d.ddde+X" / "d.ddde-X" form otherwise.
// NOTE(review): presumably kjs_dtoa returns a NUL-terminated digit string
// without a decimal point, with decimalPoint giving the position of the
// point relative to the first digit and sign being non-zero for negative
// values -- confirm against dtoa.h.
UString UString::from(double d)
{
  // avoid ever printing -NaN, in JS conceptually there is only one NaN value
  if (isNaN(d))
    return UString("NaN", 3);

  char buf[80];
  int decimalPoint;
  int sign;

  char *result = kjs_dtoa(d, 0, 0, &decimalPoint, &sign, NULL);
  int length = static_cast<int>(strlen(result));

  // i is the write position in buf; buf is never NUL-terminated here because
  // the final UString is built from an explicit length.
  int i = 0;
  if (sign) {
    buf[i++] = '-';
  }

  if (decimalPoint <= 0 && decimalPoint > -6) {
    // "0." followed by -decimalPoint zeros and then the digits.
    buf[i++] = '0';
    buf[i++] = '.';
    for (int j = decimalPoint; j < 0; j++) {
      buf[i++] = '0';
    }
    strcpy(buf + i, result);
    i += length;
  } else if (decimalPoint <= 21 && decimalPoint > 0) {
    if (length <= decimalPoint) {
      // All digits are in front of the point: pad with trailing zeros.
      strcpy(buf + i, result);
      i += length;
      for (int j = 0; j < decimalPoint - length; j++) {
        buf[i++] = '0';
      }
//    buf[i] = '\0';
    } else {
      // The point falls inside the digit string: split it there.
      strncpy(buf + i, result, decimalPoint);
      i += decimalPoint;
      buf[i++] = '.';
      strcpy(buf + i, result + decimalPoint);
      i += length - decimalPoint;
    }
  } else if (result[0] < '0' || result[0] > '9') {
    // Not a digit string: copy the text through unchanged.
    strcpy(buf + i, result);
    i += length;
  } else {
    // Exponential notation: first digit, optional fraction, then the exponent.
    buf[i++] = result[0];
    if (length > 1) {
      buf[i++] = '.';
      strcpy(buf + i, result + 1);
      i += length - 1;
    }

    buf[i++] = 'e';
    buf[i++] = (decimalPoint >= 0) ? '+' : '-';
    // decimalPoint can't be more than 3 digits decimal given the
    // nature of float representation
    int exponential = decimalPoint - 1;
    if (exponential < 0) {
      exponential = exponential * -1;
    }
    // Emit the exponent magnitude without leading zeros (up to three digits).
    if (exponential >= 100) {
      buf[i++] = '0' + exponential / 100;
    }
    if (exponential >= 10) {
      buf[i++] = '0' + (exponential % 100) / 10;
    }
    buf[i++] = '0' + exponential % 10;
//    buf[i++] = '\0';
  }

  kjs_freedtoa(result);

  return UString(buf, i);
}
695
696UString UString::spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const
697{
698 if (rangeCount == 1 && separatorCount == 0) {
699 int thisSize = size();
700 int position = substringRanges[0].position;
701 int length = substringRanges[0].length;
702 if (position <= 0 && length >= thisSize)
703 return *this;
704 return UString::Rep::create(m_rep, maxInt(0, position), minInt(thisSize, length));
705 }
706
707 int totalLength = 0;
708 for (int i = 0; i < rangeCount; i++)
709 totalLength += substringRanges[i].length;
710 for (int i = 0; i < separatorCount; i++)
711 totalLength += separators[i].size();
712
713 if (totalLength == 0)
714 return "";
715
716 UChar* buffer = allocChars(totalLength);
717 if (!buffer)
718 return null();
719
720 int maxCount = max(rangeCount, separatorCount);
721 int bufferPos = 0;
722 for (int i = 0; i < maxCount; i++) {
723 if (i < rangeCount) {
724 memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar));
725 bufferPos += substringRanges[i].length;
726 }
727 if (i < separatorCount) {
728 memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar));
729 bufferPos += separators[i].size();
730 }
731 }
732
733 return UString::Rep::create(buffer, totalLength);
734}
735
736// Append a sub-string of <subStr> to this string.
737// Equivalent to append(subStr.substr(subPos, subLength))
738
739UString& UString::append(const UString& subStr, int subPos, int subLength)
740{
741 int subSize = subStr.size();
742
743 if (subPos < 0)
744 subPos = 0;
745 else if (subPos >= subSize)
746 subPos = subSize;
747 if (subLength < 0)
748 subLength = subSize;
749 if (subPos + subLength >= subSize)
750 subLength = subSize - subPos;
751
752 return append(UString(subStr.data() + subPos, subLength));
753}
754
755UString &UString::append(const UString &t)
756{
757 int thisSize = size();
758 int thisOffset = m_rep->offset;
759 int tSize = t.size();
760 int length = thisSize + tSize;
761
762 // possible cases:
763 if (thisSize == 0) {
764 // this is empty
765 *this = t;
766 } else if (tSize == 0) {
767 // t is empty
768 } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
769 // this is direct and has refcount of 1 (so we can just alter it directly)
770 expandCapacity(thisOffset + length);
771 if (data()) {
772 memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
773 m_rep->len = length;
774 m_rep->_hash = 0;
775 }
776 } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
777 // this reaches the end of the buffer - extend it if it's long enough to append to
778 expandCapacity(thisOffset + length);
779 if (data()) {
780 memcpy(const_cast<UChar*>(data() + thisSize), t.data(), tSize * sizeof(UChar));
781 m_rep = Rep::create(m_rep, 0, length);
782 }
783 } else {
784 // this is shared with someone using more capacity, gotta make a whole new string
785 size_t newCapacity = expandedSize(length, 0);
786 UChar* d = allocChars(newCapacity);
787 if (!d)
788 m_rep = &Rep::null;
789 else {
790 memcpy(d, data(), thisSize * sizeof(UChar));
791 memcpy(const_cast<UChar*>(d + thisSize), t.data(), tSize * sizeof(UChar));
792 m_rep = Rep::create(d, length);
793 m_rep->capacity = newCapacity;
794 }
795 }
796
797 return *this;
798}
799
800
801UString &UString::append(const char *t)
802{
803 int thisSize = size();
804 int thisOffset = m_rep->offset;
805 int tSize = static_cast<int>(strlen(t));
806 int length = thisSize + tSize;
807
808 // possible cases:
809 if (thisSize == 0) {
810 // this is empty
811 *this = t;
812 } else if (tSize == 0) {
813 // t is empty, we'll just return *this below.
814 } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
815 // this is direct and has refcount of 1 (so we can just alter it directly)
816 expandCapacity(thisOffset + length);
817 UChar *d = const_cast<UChar *>(data());
818 if (d) {
819 for (int i = 0; i < tSize; ++i)
820 d[thisSize + i] = t[i];
821 m_rep->len = length;
822 m_rep->_hash = 0;
823 }
824 } else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
825 // this string reaches the end of the buffer - extend it
826 expandCapacity(thisOffset + length);
827 UChar *d = const_cast<UChar *>(data());
828 if (d) {
829 for (int i = 0; i < tSize; ++i)
830 d[thisSize + i] = t[i];
831 m_rep = Rep::create(m_rep, 0, length);
832 }
833 } else {
834 // this is shared with someone using more capacity, gotta make a whole new string
835 size_t newCapacity = expandedSize(length, 0);
836 UChar* d = allocChars(newCapacity);
837 if (!d)
838 m_rep = &Rep::null;
839 else {
840 memcpy(d, data(), thisSize * sizeof(UChar));
841 for (int i = 0; i < tSize; ++i)
842 d[thisSize + i] = t[i];
843 m_rep = Rep::create(d, length);
844 m_rep->capacity = newCapacity;
845 }
846 }
847
848 return *this;
849}
850
851UString &UString::append(unsigned short c)
852{
853 int thisOffset = m_rep->offset;
854 int length = size();
855
856 // possible cases:
857 if (length == 0) {
858 // this is empty - must make a new m_rep because we don't want to pollute the shared empty one
859 size_t newCapacity = expandedSize(1, 0);
860 UChar* d = allocChars(newCapacity);
861 if (!d)
862 m_rep = &Rep::null;
863 else {
864 d[0] = c;
865 m_rep = Rep::create(d, 1);
866 m_rep->capacity = newCapacity;
867 }
868 } else if (m_rep->baseIsSelf() && m_rep->rc == 1) {
869 // this is direct and has refcount of 1 (so we can just alter it directly)
870 expandCapacity(thisOffset + length + 1);
871 UChar *d = const_cast<UChar *>(data());
872 if (d) {
873 d[length] = c;
874 m_rep->len = length + 1;
875 m_rep->_hash = 0;
876 }
877 } else if (thisOffset + length == usedCapacity() && length >= minShareSize) {
878 // this reaches the end of the string - extend it and share
879 expandCapacity(thisOffset + length + 1);
880 UChar *d = const_cast<UChar *>(data());
881 if (d) {
882 d[length] = c;
883 m_rep = Rep::create(m_rep, 0, length + 1);
884 }
885 } else {
886 // this is shared with someone using more capacity, gotta make a whole new string
887 size_t newCapacity = expandedSize(length + 1, 0);
888 UChar* d = allocChars(newCapacity);
889 if (!d)
890 m_rep = &Rep::null;
891 else {
892 memcpy(d, data(), length * sizeof(UChar));
893 d[length] = c;
894 m_rep = Rep::create(d, length + 1);
895 m_rep->capacity = newCapacity;
896 }
897 }
898
899 return *this;
900}
901
902CString UString::cstring() const
903{
904 return ascii();
905}
906
907char *UString::ascii() const
908{
909 // Never make the buffer smaller than normalStatBufferSize.
910 // Thus we almost never need to reallocate.
911 int length = size();
912 int neededSize = length + 1;
913 if (neededSize < normalStatBufferSize) {
914 neededSize = normalStatBufferSize;
915 }
916 if (neededSize != statBufferSize) {
917 delete [] statBuffer;
918 statBuffer = new char [neededSize];
919 statBufferSize = neededSize;
920 }
921
922 const UChar *p = data();
923 char *q = statBuffer;
924 const UChar *limit = p + length;
925 while (p != limit) {
926 *q = static_cast<char>(p->uc);
927 ++p;
928 ++q;
929 }
930 *q = '\0';
931
932 return statBuffer;
933}
934
935UString& UString::operator=(Empty)
936{
937 m_rep = &Rep::empty;
938
939 return *this;
940}
941
942UString& UString::operator=(const char* c)
943{
944 set(c, c ? strlen(c) : 0);
945
946 return *this;
947}
948
949void UString::set(const char* c, int l)
950{
951 if (!c) {
952 m_rep = &Rep::null;
953 return;
954 }
955
956 if (l == 0) {
957 m_rep = &Rep::empty;
958 return;
959 }
960
961 UChar *d;
962 if (m_rep->rc == 1 && l <= m_rep->capacity && m_rep->baseIsSelf() && m_rep->offset == 0 && m_rep->preCapacity == 0) {
963 d = m_rep->buf;
964 m_rep->_hash = 0;
965 m_rep->len = l;
966 } else {
967 d = allocChars(l);
968 if (!d) {
969 m_rep = &Rep::null;
970 return;
971 }
972 m_rep = Rep::create(d, l);
973 }
974 for (int i = 0; i < l; i++)
975 d[i].uc = static_cast<unsigned char>(c[i]);
976}
977
978bool UString::is8Bit() const
979{
980 const UChar *u = data();
981 const UChar *limit = u + size();
982 while (u < limit) {
983 if (u->uc > 0xFF)
984 return false;
985 ++u;
986 }
987
988 return true;
989}
990
991const UChar UString::operator[](int pos) const
992{
993 if (pos >= size())
994 return '\0';
995 return data()[pos];
996}
997
// Parses the string as a number.
//
// Leading white space (per CommonUnicode::isStrWhiteSpace) is skipped. The
// remainder must consist of 8-bit characters only, otherwise the result is
// NaN. Accepted forms are a "0x"/"0X" prefixed hexadecimal number, anything
// kjs_strtod() accepts, and an optionally signed "Infinity". Trailing ASCII
// white space is always allowed.
//
// tolerateTrailingJunk: when false, any other trailing character makes the
//                       result NaN.
// tolerateEmptyString:  when true, an empty (or all white space) string
//                       yields 0.0 instead of NaN.
double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
{
  double d;

  const int length = size();
  int leadingSpaces = 0;

  // skip leading white space
  while (leadingSpaces < length && CommonUnicode::isStrWhiteSpace(data()[leadingSpaces].uc))
    ++leadingSpaces;

  UString whitespaceSkipped = substr(leadingSpaces, length - leadingSpaces);

  // FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
  // after the number, so is8Bit is too strict a check.
  if (!whitespaceSkipped.is8Bit())
    return NaN;

  // c walks the 8-bit conversion; ascii() returns a buffer shared between
  // calls, so it must not be called again while c is in use.
  const char *c = whitespaceSkipped.ascii();

  // empty string ?
  if (*c == '\0')
    return tolerateEmptyString ? 0.0 : NaN;

  // hex number ?
  if (*c == '0' && (*(c+1) == 'x' || *(c+1) == 'X')) {
    const char* firstDigitPosition = c + 2;
    // Step onto the 'x'; the pre-increment in the loop then moves to the
    // first hex digit.
    c++;
    d = 0.0;
    while (*(++c)) {
      if (*c >= '0' && *c <= '9')
        d = d * 16.0 + *c - '0';
      else if ((*c >= 'A' && *c <= 'F') || (*c >= 'a' && *c <= 'f'))
        // Clearing bit 0x20 folds lower case letters to upper case.
        d = d * 16.0 + (*c & 0xdf) - 'A' + 10.0;
      else
        break;
    }

    // Past this bound the running double may have lost precision; reparse
    // the digit run with parseIntOverflow().
    if (d >= mantissaOverflowLowerBound)
      d = parseIntOverflow(firstDigitPosition, c - firstDigitPosition, 16);
  } else {
    // regular number ?
    char *end;
    d = kjs_strtod(c, &end);
    // Accept the result when something was consumed (or the value is
    // non-zero) and it is finite; infinities are vetted separately below.
    if ((d != 0.0 || end != c) && d != Inf && d != -Inf) {
      c = end;
    } else {
      double sign = 1.0;

      if (*c == '+')
        c++;
      else if (*c == '-') {
        sign = -1.0;
        c++;
      }

      // We used strtod() to do the conversion. However, strtod() handles
      // infinite values slightly differently than JavaScript in that it
      // converts the string "inf" with any capitalization to infinity,
      // whereas the ECMA spec requires that it be converted to NaN.

      if (strncmp(c, "Infinity", 8) == 0) {
        d = sign * Inf;
        c += 8;
      } else if ((d == Inf || d == -Inf) && *c != 'I' && *c != 'i')
        // An infinity that was not spelled with a leading 'I'/'i' is kept
        // as parsed by kjs_strtod().
        c = end;
      else
        return NaN;
    }
  }

  // allow trailing white space
  while (isASCIISpace(*c))
    c++;
  // don't allow anything after - unless tolerant=true
  if (!tolerateTrailingJunk && *c != '\0')
    d = NaN;

  return d;
}
1078
1079#ifdef __FAST_MATH__
1080# error "KJS does not work correctly with -ffast-math"
1081#endif
1082
1083double UString::toDouble(bool tolerateTrailingJunk) const
1084{
1085 return toDouble(tolerateTrailingJunk, true);
1086}
1087
1088double UString::toDouble() const
1089{
1090 return toDouble(false, true);
1091}
1092
1093uint32_t UString::toStrictUInt32(bool *ok) const
1094{
1095 if (ok)
1096 *ok = false;
1097
1098 // Empty string is not OK.
1099 int len = m_rep->len;
1100 if (len == 0)
1101 return 0;
1102 const UChar *p = m_rep->data();
1103 unsigned short c = p->unicode();
1104
1105 // If the first digit is 0, only 0 itself is OK.
1106 if (c == '0') {
1107 if (len == 1 && ok)
1108 *ok = true;
1109 return 0;
1110 }
1111
1112 // Convert to UInt32, checking for overflow.
1113 uint32_t i = 0;
1114 while (1) {
1115 // Process character, turning it into a digit.
1116 if (c < '0' || c > '9')
1117 return 0;
1118 const unsigned d = c - '0';
1119
1120 // Multiply by 10, checking for overflow out of 32 bits.
1121 if (i > 0xFFFFFFFFU / 10)
1122 return 0;
1123 i *= 10;
1124
1125 // Add in the digit, checking for overflow out of 32 bits.
1126 const unsigned max = 0xFFFFFFFFU - d;
1127 if (i > max)
1128 return 0;
1129 i += d;
1130
1131 // Handle end of string.
1132 if (--len == 0) {
1133 if (ok)
1134 *ok = true;
1135 return i;
1136 }
1137
1138 // Get next character.
1139 c = (++p)->unicode();
1140 }
1141}
1142
1143int UString::find(const UString &f, int pos) const
1144{
1145 int sz = size();
1146 int fsz = f.size();
1147 if (sz < fsz)
1148 return -1;
1149 if (pos < 0)
1150 pos = 0;
1151 if (fsz == 0)
1152 return pos;
1153 const UChar* data_ = data();
1154 const UChar* end = data_ + sz - fsz;
1155 int fsizeminusone = (fsz - 1) * sizeof(UChar);
1156 const UChar *fdata = f.data();
1157 unsigned short fchar = fdata->uc;
1158 ++fdata;
1159 for (const UChar* c = data_ + pos; c <= end; c++)
1160 if (c->uc == fchar && !memcmp(c + 1, fdata, fsizeminusone))
1161 return (c - data_);
1162
1163 return -1;
1164}
1165
1166int UString::find(UChar ch, int pos) const
1167{
1168 if (pos < 0)
1169 pos = 0;
1170 const UChar* data_ = data();
1171 const UChar *end = data_ + size();
1172 for (const UChar *c = data_ + pos; c < end; c++)
1173 if (*c == ch)
1174 return (c - data_);
1175
1176 return -1;
1177}
1178
1179int UString::rfind(const UString &f, int pos) const
1180{
1181 int sz = size();
1182 int fsz = f.size();
1183 if (sz < fsz)
1184 return -1;
1185 if (pos < 0)
1186 pos = 0;
1187 if (pos > sz - fsz)
1188 pos = sz - fsz;
1189 if (fsz == 0)
1190 return pos;
1191 int fsizeminusone = (fsz - 1) * sizeof(UChar);
1192 const UChar *fdata = f.data();
1193 const UChar* data_ = data();
1194 for (const UChar* c = data_ + pos; c >= data_; c--) {
1195 if (*c == *fdata && !memcmp(c + 1, fdata + 1, fsizeminusone))
1196 return (c - data_);
1197 }
1198
1199 return -1;
1200}
1201
1202int UString::rfind(UChar ch, int pos) const
1203{
1204 if (isEmpty())
1205 return -1;
1206 if (pos + 1 >= size())
1207 pos = size() - 1;
1208 const UChar* data_ = data();
1209 for (const UChar* c = data_ + pos; c >= data_; c--) {
1210 if (*c == ch)
1211 return (c - data_);
1212 }
1213
1214 return -1;
1215}
1216
1217UString UString::substr(int pos, int len) const
1218{
1219 int s = size();
1220
1221 if (pos < 0)
1222 pos = 0;
1223 else if (pos >= s)
1224 pos = s;
1225 if (len < 0)
1226 len = s;
1227 if (pos + len >= s)
1228 len = s - pos;
1229
1230 if (pos == 0 && len == s)
1231 return *this;
1232
1233 return UString(Rep::create(m_rep, pos, len));
1234}
1235
1236void UString::copyForWriting()
1237{
1238 int l = size();
1239 if (!l) return; // Not going to touch anything anyway.
1240 if (m_rep->rc > 1 || !m_rep->baseIsSelf()) {
1241 UChar* n = allocChars(l);
1242 memcpy(n, data(), l * sizeof(UChar));
1243 m_rep = Rep::create(n, l);
1244 }
1245}
1246
1247bool operator==(const UString& s1, const UString& s2)
1248{
1249#if 0
1250 if (s1.m_rep == s2.m_rep)
1251 return true;
1252#endif
1253
1254 if (s1.m_rep->len != s2.m_rep->len)
1255 return false;
1256
1257 return (memcmp(s1.m_rep->data(), s2.m_rep->data(),
1258 s1.m_rep->len * sizeof(UChar)) == 0);
1259}
1260
1261bool operator==(const UString& s1, const char *s2)
1262{
1263 if (s2 == 0) {
1264 return s1.isEmpty();
1265 }
1266
1267 const UChar *u = s1.data();
1268 const UChar *uend = u + s1.size();
1269 while (u != uend && *s2) {
1270 if (u->uc != (unsigned char)*s2)
1271 return false;
1272 s2++;
1273 u++;
1274 }
1275
1276 return u == uend && *s2 == 0;
1277}
1278
1279bool operator<(const UString& s1, const UString& s2)
1280{
1281 const int l1 = s1.size();
1282 const int l2 = s2.size();
1283 const int lmin = l1 < l2 ? l1 : l2;
1284 const UChar *c1 = s1.data();
1285 const UChar *c2 = s2.data();
1286 int l = 0;
1287 while (l < lmin && *c1 == *c2) {
1288 c1++;
1289 c2++;
1290 l++;
1291 }
1292 if (l < lmin)
1293 return (c1->uc < c2->uc);
1294
1295 return (l1 < l2);
1296}
1297
1298bool UString::equal(const UString::Rep *r, const UString::Rep *b)
1299{
1300 if (r == b)
1301 return true;
1302
1303 int length = r->len;
1304 if (length != b->len)
1305 return false;
1306
1307 const UChar *d = r->data();
1308 const UChar *s = b->data();
1309 for (int i = 0; i != length; ++i)
1310 if (d[i].uc != s[i].uc)
1311 return false;
1312 return true;
1313}
1314
1315
1316int compare(const UString& s1, const UString& s2)
1317{
1318 const int l1 = s1.size();
1319 const int l2 = s2.size();
1320 const int lmin = l1 < l2 ? l1 : l2;
1321 const UChar *c1 = s1.data();
1322 const UChar *c2 = s2.data();
1323 int l = 0;
1324 while (l < lmin && *c1 == *c2) {
1325 c1++;
1326 c2++;
1327 l++;
1328 }
1329
1330 if (l < lmin)
1331 return (c1->uc > c2->uc) ? 1 : -1;
1332
1333 if (l1 == l2)
1334 return 0;
1335
1336 return (l1 > l2) ? 1 : -1;
1337}
1338
// Classifies a byte with its high bit set as the lead byte of a UTF-8 sequence.
// Returns 2, 3 or 4 for lead bytes 0xC0-0xDF, 0xE0-0xEF and 0xF0-0xF7
// respectively, and 0 for every other byte value.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
  // Work on the unsigned byte value so the ranges read naturally regardless of
  // whether plain char is signed.
  const unsigned lead = static_cast<unsigned char>(b0);

  if (lead < 0xC0)
    return 0; // top two bits are not both set
  if (lead < 0xE0)
    return 2; // 110xxxxx
  if (lead < 0xF0)
    return 3; // 1110xxxx
  if (lead < 0xF8)
    return 4; // 11110xxx
  return 0;   // 11111xxx
}
1351
1352int UTF8SequenceLengthNonASCII(char b0)
1353{
1354 return inlineUTF8SequenceLengthNonASCII(b0);
1355}
1356
1357inline int inlineUTF8SequenceLength(char b0)
1358{
1359 return (b0 & 0x80) == 0 ? 1 : UTF8SequenceLengthNonASCII(b0);
1360}
1361
1362// Given a first byte, gives the length of the UTF-8 sequence it begins.
1363// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1364// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1365int UTF8SequenceLength(char b0)
1366{
1367 return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
1368}
1369
1370// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1371// Only allows Unicode characters (U-00000000 to U-0010FFFF).
1372// Returns -1 if the sequence is not valid (including presence of extra bytes).
1373int decodeUTF8Sequence(const char *sequence)
1374{
1375 // Handle 0-byte sequences (never valid).
1376 const unsigned char b0 = sequence[0];
1377 const int length = inlineUTF8SequenceLength(b0);
1378 if (length == 0)
1379 return -1;
1380
1381 // Handle 1-byte sequences (plain ASCII).
1382 const unsigned char b1 = sequence[1];
1383 if (length == 1) {
1384 if (b1)
1385 return -1;
1386 return b0;
1387 }
1388
1389 // Handle 2-byte sequences.
1390 if ((b1 & 0xC0) != 0x80)
1391 return -1;
1392 const unsigned char b2 = sequence[2];
1393 if (length == 2) {
1394 if (b2)
1395 return -1;
1396 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
1397 if (c < 0x80)
1398 return -1;
1399 return c;
1400 }
1401
1402 // Handle 3-byte sequences.
1403 if ((b2 & 0xC0) != 0x80)
1404 return -1;
1405 const unsigned char b3 = sequence[3];
1406 if (length == 3) {
1407 if (b3)
1408 return -1;
1409 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
1410 if (c < 0x800)
1411 return -1;
1412 // UTF-16 surrogates should never appear in UTF-8 data.
1413 if (c >= 0xD800 && c <= 0xDFFF)
1414 return -1;
1415 // Backwards BOM and U+FFFF should never appear in UTF-8 data.
1416 if (c == 0xFFFE || c == 0xFFFF)
1417 return -1;
1418 return c;
1419 }
1420
1421 // Handle 4-byte sequences.
1422 if ((b3 & 0xC0) != 0x80)
1423 return -1;
1424 const unsigned char b4 = sequence[4];
1425 if (length == 4) {
1426 if (b4)
1427 return -1;
1428 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
1429 if (c < 0x10000 || c > 0x10FFFF)
1430 return -1;
1431 return c;
1432 }
1433
1434 return -1;
1435}
1436
1437CString UString::UTF8String() const
1438{
1439 // Allocate a buffer big enough to hold all the characters.
1440 const int length = size();
1441 Vector<char, 1024> buffer(length * 3);
1442
1443 // Convert to runs of 8-bit characters.
1444 char *p = buffer.begin();
1445 const unsigned short* d = &data()->uc;
1446 for (int i = 0; i != length; ++i) {
1447 unsigned int c = d[i], sc;
1448 if (c < 0x80) {
1449 *p++ = (char)c;
1450 } else if (c < 0x800) {
1451 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
1452 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1453 } else if (c >= 0xD800 && c <= 0xDBFF && (i+1) < length &&
1454 (sc = d[i+1]) >= 0xDC00 && sc <= 0xDFFF) {
1455 sc = 0x10000 + (((c & 0x3FF) << 10) | (sc & 0x3FF));
1456 *p++ = (char)((sc >> 18) | 0xF0); // F0 is the 4-byte flag for UTF-8
1457 *p++ = (char)(((sc >> 12) | 0x80) & 0xBF); // next 6 bits, with high bit set
1458 *p++ = (char)(((sc >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1459 *p++ = (char)((sc | 0x80) & 0xBF); // next 6 bits, with high bit set
1460 ++i;
1461 } else {
1462 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
1463 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
1464 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
1465 }
1466 }
1467
1468 // Return the result as a C string.
1469 CString result(buffer.data(), p - buffer.data());
1470
1471 return result;
1472}
1473
1474} // namespace KJS
1475