ustring.cpp [kdelibs/kjs/ustring.cpp]

// -*- c-basic-offset: 2 -*-
2	/*
3	* This file is part of the KDE libraries
4	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5	* Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.
6	* Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
7	*
8	* This library is free software; you can redistribute it and/or
9	* modify it under the terms of the GNU Library General Public
10	* License as published by the Free Software Foundation; either
11	* version 2 of the License, or (at your option) any later version.
12	*
13	* This library is distributed in the hope that it will be useful,
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16	* Library General Public License for more details.
17	*
18	* You should have received a copy of the GNU Library General Public License
19	* along with this library; see the file COPYING.LIB. If not, write to
20	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21	* Boston, MA 02110-1301, USA.
22	*
23	*/
24
25	#include "ustring.h"
26	#include <config-kjs.h>
27
28	#include <assert.h>
29	#include <stdlib.h>
30	#include <stdio.h>
31	#include "wtf/DisallowCType.h"
32	#include "wtf/ASCIICType.h"
33	#if HAVE(STRING_H)
34	#include <string.h>
35	#endif
36	#if HAVE(STRINGS_H)
37	#include <strings.h>
38	#endif
39	#include <limits.h>
40
41	#include "operations.h"
42	#include "function.h"
43	#include "identifier.h"
44	#include <math.h>
45	#include "dtoa.h"
46	#include "collector.h"
47	#include "commonunicode.h"
48
49	#include <wtf/Vector.h>
50
51	using std::max;
52
53	// GCC cstring uses these automatically, but not all implementations do.
54	using std::strlen;
55	using std::strcpy;
56	using std::strncpy;
57	using std::memset;
58	using std::memcpy;
59
60	using namespace WTF;
61
62	namespace KJS {
63
64	extern const double NaN;
65	extern const double Inf;
66
67	static inline size_t overflowIndicator() { return std::numeric_limits<size_t>::max(); }
68	static inline size_t maxUChars() { return std::numeric_limits<size_t>::max() / sizeof(UChar); }
69
70	static inline UChar* allocChars(size_t length)
71	{
72	assert(length);
73	if (length > maxUChars())
74	return `0`;
75	return static_cast<UChar>(fastMalloc(sizeof(UChar) length));
76	}
77
78	static inline UChar* reallocChars(UChar* buffer, size_t length)
79	{
80	ASSERT(length);
81	if (length > maxUChars())
82	return `0`;
83	return static_cast<UChar>(fastRealloc(buffer, sizeof(UChar) length));
84	}
85
86	CString::CString(const char *c)
87	{
88	length = strlen(c);
89	data = new char[length+`1`];
90	memcpy(data, c, length + `1`);
91	}
92
93	CString::CString(const char *c, size_t len)
94	{
95	length = len;
96	data = new char[len+`1`];
97	memcpy(data, c, len);
98	data[len] = `0`;
99	}
100
101	CString::CString(const CString &b)
102	{
103	length = b.length;
104	if (length > `0` && b.data) {
105	data = new char[length+`1`];
106	memcpy(data, b.data, length + `1`);
107	}
108	else
109	data = `0`;
110	}
111
112	CString::~CString()
113	{
114	delete [] data;
115	}
116
117	CString &CString::operator=(const char *c)
118	{
119	if (data)
120	delete [] data;
121	length = strlen(c);
122	data = new char[length+`1`];
123	memcpy(data, c, length + `1`);
124
125	return *this;
126	}
127
128	CString &CString::operator=(const CString &str)
129	{
130	if (this == &str)
131	return *this;
132
133	if (data)
134	delete [] data;
135	length = str.length;
136	if (str.data) {
137	data = new char[length + `1`];
138	memcpy(data, str.data, length + `1`);
139	}
140	else
141	data = `0`;
142
143	return *this;
144	}
145
146	bool operator==(const CString& c1, const CString& c2)
147	{
148	size_t len = c1.size();
149	return len == c2.size() && (len == `0` \|\| memcmp(c1.c_str(), c2.c_str(), len) == `0`);
150	}
151
152	// Hack here to avoid a global with a constructor; point to an unsigned short instead of a UChar.
153	static unsigned short almostUChar;
154	UString::Rep UString::Rep::null = { `0`, `0`, `1`, `0`, `0`, &UString::Rep::null, `0`, `0`, `0`, `0`, `0`, `0` };
155	UString::Rep UString::Rep::empty = { `0`, `0`, `1`, `0`, `0`, &UString::Rep::empty, `0`, reinterpret_cast<UChar*>(&almostUChar), `0`, `0`, `0`, `0` };
156	const int normalStatBufferSize = `4096`;
157	static char statBuffer = `0`; // FIXME: This buffer is never deallocated.*
158	static int statBufferSize = `0`;
159
160	PassRefPtr<UString::Rep> UString::Rep::createCopying (const UChar* d, int length)
161	{
162	UChar* copyD = allocChars(length);
163	memcpy(copyD, d, length * sizeof(UChar));
164
165	return create(copyD, length);
166	}
167
168	PassRefPtr<UString::Rep> UString::Rep::create(UChar d, int* l)
169	{
170	Rep* r = new Rep;
171	r->offset = `0`;
172	r->len = l;
173	r->rc = `1`;
174	r->_hash = `0`;
175	r->isIdentifier = `0`;
176	r->baseString = r;
177	r->reportedCost = `0`;
178	r->buf = d;
179	r->usedCapacity = l;
180	r->capacity = l;
181	r->usedPreCapacity = `0`;
182	r->preCapacity = `0`;
183
184	// steal the single reference this Rep was created with
185	return adoptRef(r);
186	}
187
188	PassRefPtr<UString::Rep> UString::Rep::create(PassRefPtr<Rep> base, int offset, int length)
189	{
190	assert(base);
191
192	int baseOffset = base ->offset;
193
194	base = base ->baseString;
195
196	assert(-(offset + baseOffset) <= base->usedPreCapacity);
197	assert(offset + baseOffset + length <= base->usedCapacity);
198
199	Rep* r = new Rep;
200	r->offset = baseOffset + offset;
201	r->len = length;
202	r->rc = `1`;
203	r->_hash = `0`;
204	r->isIdentifier = `0`;
205	r->baseString = base.releaseRef();
206	r->reportedCost = `0`;
207	r->buf = `0`;
208	r->usedCapacity = `0`;
209	r->capacity = `0`;
210	r->usedPreCapacity = `0`;
211	r->preCapacity = `0`;
212
213	// steal the single reference this Rep was created with
214	return adoptRef(r);
215	}
216
217	void UString::Rep::destroy()
218	{
219	if (isIdentifier)
220	Identifier::remove(this);
221	if (baseString != this) {
222	baseString->deref();
223	} else {
224	fastFree(buf);
225	}
226	delete this;
227	}
228
// Golden ratio - an arbitrary start value for the hash functions below, so
// that an all-zero input does not map to an all-zero hash (or anything
// like that).
const unsigned PHI = 0x9e3779b9U;
232
233	// Paul Hsieh's SuperFastHash
234	// http://www.azillionmonkeys.com/qed/hash.html
235	unsigned UString::Rep::computeHash(const UChar s, int* len)
236	{
237	unsigned l = len;
238	uint32_t hash = PHI;
239	uint32_t tmp;
240
241	int rem = l & `1`;
242	l >>= `1`;
243
244	// Main loop
245	for (; l > `0`; l--) {
246	hash += s[`0`].uc;
247	tmp = (s[`1`].uc << `11`) ^ hash;
248	hash = (hash << `16`) ^ tmp;
249	s += `2`;
250	hash += hash >> `11`;
251	}
252
253	// Handle end case
254	if (rem) {
255	hash += s[`0`].uc;
256	hash ^= hash << `11`;
257	hash += hash >> `17`;
258	}
259
260	// Force "avalanching" of final 127 bits
261	hash ^= hash << `3`;
262	hash += hash >> `5`;
263	hash ^= hash << `2`;
264	hash += hash >> `15`;
265	hash ^= hash << `10`;
266
267	// this avoids ever returning a hash code of 0, since that is used to
268	// signal "hash not computed yet", using a value that is likely to be
269	// effectively the same as 0 when the low bits are masked
270	if (hash == `0`)
271	hash = `0x80000000`;
272
273	return hash;
274	}
275
276	// Paul Hsieh's SuperFastHash
277	// http://www.azillionmonkeys.com/qed/hash.html
278	unsigned UString::Rep::computeHash(const char* s, int len)
279	{
280	// This hash is designed to work on 16-bit chunks at a time. But since the normal case
281	// (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
282	// were 16-bit chunks, which should give matching results
283
284	uint32_t hash = PHI;
285	uint32_t tmp;
286	unsigned l = len;
287
288	int rem = l & `1`;
289	l >>= `1`;
290
291	// Main loop
292	for (; l > `0`; l--) {
293	hash += (unsigned char)s[`0`];
294	tmp = ((unsigned char)s[`1`] << `11`) ^ hash;
295	hash = (hash << `16`) ^ tmp;
296	s += `2`;
297	hash += hash >> `11`;
298	}
299
300	// Handle end case
301	if (rem) {
302	hash += (unsigned char)s[`0`];
303	hash ^= hash << `11`;
304	hash += hash >> `17`;
305	}
306
307	// Force "avalanching" of final 127 bits
308	hash ^= hash << `3`;
309	hash += hash >> `5`;
310	hash ^= hash << `2`;
311	hash += hash >> `15`;
312	hash ^= hash << `10`;
313
314	// this avoids ever returning a hash code of 0, since that is used to
315	// signal "hash not computed yet", using a value that is likely to be
316	// effectively the same as 0 when the low bits are masked
317	if (hash == `0`)
318	hash = `0x80000000`;
319
320	return hash;
321	}
322
323	unsigned UString::Rep::computeHash(const char* s)
324	{
325	return computeHash(s, strlen(s));
326	}
327
328	// put these early so they can be inlined
329	inline size_t UString::expandedSize(size_t size, size_t otherSize) const
330	{
331	// Do the size calculation in two parts, returning overflowIndicator if
332	// we overflow the maximum value that we can handle.
333
334	if (size > maxUChars())
335	return overflowIndicator();
336
337	size_t expandedSize = ((size + `10`) / `10` * `11`) + `1`;
338	if (maxUChars() - expandedSize < otherSize)
339	return overflowIndicator();
340
341	return expandedSize + otherSize;
342	}
343
344	inline int UString::usedCapacity() const
345	{
346	return m_rep ->baseString->usedCapacity;
347	}
348
349	inline int UString::usedPreCapacity() const
350	{
351	return m_rep ->baseString->usedPreCapacity;
352	}
353
354	void UString::expandCapacity(int requiredLength)
355	{
356	Rep* r = m_rep ->baseString;
357
358	if (requiredLength > r->capacity) {
359	size_t newCapacity = expandedSize(requiredLength, r->preCapacity);
360	UChar* oldBuf = r->buf;
361	r->buf = reallocChars(r->buf, newCapacity);
362	if (!r->buf) {
363	r->buf = oldBuf;
364	m_rep = &Rep::null;
365	return;
366	}
367	r->capacity = newCapacity - r->preCapacity;
368	}
369	if (requiredLength > r->usedCapacity) {
370	r->usedCapacity = requiredLength;
371	}
372	}
373
374	void UString::expandPreCapacity(int requiredPreCap)
375	{
376	Rep* r = m_rep ->baseString;
377
378	if (requiredPreCap > r->preCapacity) {
379	size_t newCapacity = expandedSize(requiredPreCap, r->capacity);
380	int delta = newCapacity - r->capacity - r->preCapacity;
381
382	UChar* newBuf = allocChars(newCapacity);
383	if (!newBuf) {
384	m_rep = &Rep::null;
385	return;
386	}
387	memcpy(newBuf + delta, r->buf, (r->capacity + r->preCapacity) * sizeof(UChar));
388	fastFree(r->buf);
389	r->buf = newBuf;
390
391	r->preCapacity = newCapacity - r->capacity;
392	}
393	if (requiredPreCap > r->usedPreCapacity) {
394	r->usedPreCapacity = requiredPreCap;
395	}
396	}
397
398
399	UString::UString(Empty)
400	: m_rep (&Rep::empty)
401	{
402	}
403
404	UString::UString(char c)
405	: m_rep (Rep::create(allocChars(`1`), `1`))
406	{
407	m_rep ->buf[`0`] = static_cast<unsigned char>(c);
408	}
409
410	UString::UString(const char* c)
411	{
412	if (!c) {
413	m_rep = &Rep::null;
414	return;
415	}
416
417	if (!c[`0`]) {
418	m_rep = &Rep::empty;
419	return;
420	}
421
422	size_t length = strlen(c);
423	UChar *d = allocChars(length);
424	if (!d)
425	m_rep = &Rep::null;
426	else {
427	for (size_t i = `0`; i < length; i++)
428	d[i].uc = c[i];
429	m_rep = Rep::create(d, static_cast<int>(length));
430	}
431	}
432
433	UString::UString(const char* c, size_t length)
434	{
435	if (!c) {
436	m_rep = &Rep::null;
437	return;
438	}
439
440	if (length == `0`) {
441	m_rep = &Rep::empty;
442	return;
443	}
444
445	UChar* d = allocChars(length);
446	if (!d)
447	m_rep = &Rep::null;
448	else {
449	for (size_t i = `0`; i < length; i++)
450	d[i].uc = c[i];
451	m_rep = Rep::create(d, static_cast<int>(length));
452	}
453	}
454
455	UString::UString(const UChar* c, int length)
456	{
457	if (length == `0`)
458	m_rep = &Rep::empty;
459	else
460	m_rep = Rep::createCopying(c, length);
461	}
462
463	UString::UString(UChar* c, int length, bool copy)
464	{
465	if (length == `0`)
466	m_rep = &Rep::empty;
467	else if (copy)
468	m_rep = Rep::createCopying(c, length);
469	else
470	m_rep = Rep::create(c, length);
471	}
472
473	UString::UString(const Vector<UChar>& buffer)
474	{
475	if (!buffer.size())
476	m_rep = &Rep::empty;
477	else
478	m_rep = Rep::createCopying(buffer.data(), buffer.size());
479	}
480
481
482	UString::UString(const UString &a, const UString &b)
483	{
484	int aSize = a.size();
485	int aOffset = a.m_rep ->offset;
486	int bSize = b.size();
487	int bOffset = b.m_rep ->offset;
488	int length = aSize + bSize;
489
490	// possible cases:
491
492	if (aSize == `0`) {
493	// a is empty
494	m_rep = b.m_rep;
495	} else if (bSize == `0`) {
496	// b is empty
497	m_rep = a.m_rep;
498	} else if (aOffset + aSize == a.usedCapacity() && aSize >= minShareSize && `4` * aSize >= bSize &&
499	(-bOffset != b.usedPreCapacity() \|\| aSize >= bSize)) {
500	// - a reaches the end of its buffer so it qualifies for shared append
501	// - also, it's at least a quarter the length of b - appending to a much shorter
502	// string does more harm than good
503	// - however, if b qualifies for prepend and is longer than a, we'd rather prepend
504	UString x(a);
505	x.expandCapacity(aOffset + length);
506	if (a.data() && x.data()) {
507	memcpy(const_cast<UChar >(a.data() + aSize), b.data(), bSize sizeof(UChar));
508	m_rep = Rep::create(a.m_rep, `0`, length);
509	} else
510	m_rep = &Rep::null;
511	} else if (-bOffset == b.usedPreCapacity() && bSize >= minShareSize && `4` * bSize >= aSize) {
512	// - b reaches the beginning of its buffer so it qualifies for shared prepend
513	// - also, it's at least a quarter the length of a - prepending to a much shorter
514	// string does more harm than good
515	UString y(b);
516	y.expandPreCapacity(-bOffset + aSize);
517	if (b.data() && y.data()) {
518	memcpy(const_cast<UChar >(b.data() - aSize), a.data(), aSize sizeof(UChar));
519	m_rep = Rep::create(b.m_rep, -aSize, length);
520	} else
521	m_rep = &Rep::null;
522	} else {
523	// a does not qualify for append, and b does not qualify for prepend, gotta make a whole new string
524	size_t newCapacity = expandedSize(length, `0`);
525	UChar* d = allocChars(newCapacity);
526	if (!d)
527	m_rep = &Rep::null;
528	else {
529	memcpy(d, a.data(), aSize * sizeof(UChar));
530	memcpy(d + aSize, b.data(), bSize * sizeof(UChar));
531	m_rep = Rep::create(d, length);
532	m_rep ->capacity = newCapacity;
533	}
534	}
535	}
536
537	const UString &UString::null()
538	{
539	static UString* n = new UString;
540	return *n;
541	}
542
543	UString UString::from(int i)
544	{
545	UChar buf[`1` + sizeof(i) * `3`];
546	UChar end = buf + sizeof(buf) / sizeof*(UChar);
547	UChar *p = end;
548
549	if (i == `0`) {
550	*--p = '0';
551	} else if (i == INT_MIN) {
552	char minBuf[`1` + sizeof(i) * `3`];
553	sprintf(minBuf, "%d", INT_MIN);
554	return UString(minBuf);
555	} else {
556	bool negative = false;
557	if (i < `0`) {
558	negative = true;
559	i = -i;
560	}
561	while (i) {
562	--p = (unsigned* short)((i % `10`) + '0');
563	i /= `10`;
564	}
565	if (negative) {
566	*--p = '-';
567	}
568	}
569
570	return UString (p, static_cast<int>(end - p));
571	}
572
573	UString UString::from(unsigned int u)
574	{
575	UChar buf[sizeof(u) * `3`];
576	UChar end = buf + sizeof(buf) / sizeof*(UChar);
577	UChar *p = end;
578
579	if (u == `0`) {
580	*--p = '0';
581	} else {
582	while (u) {
583	--p = (unsigned* short)((u % `10`) + '0');
584	u /= `10`;
585	}
586	}
587
588	return UString (p, static_cast<int>(end - p));
589	}
590
591	UString UString::from(long l)
592	{
593	UChar buf[`1` + sizeof(l) * `3`];
594	UChar end = buf + sizeof(buf) / sizeof*(UChar);
595	UChar *p = end;
596
597	if (l == `0`) {
598	*--p = '0';
599	} else if (l == LONG_MIN) {
600	char minBuf[`1` + sizeof(l) * `3`];
601	sprintf(minBuf, "%ld", LONG_MIN);
602	return UString(minBuf);
603	} else {
604	bool negative = false;
605	if (l < `0`) {
606	negative = true;
607	l = -l;
608	}
609	while (l) {
610	--p = (unsigned* short)((l % `10`) + '0');
611	l /= `10`;
612	}
613	if (negative) {
614	*--p = '-';
615	}
616	}
617
618	return UString (p, static_cast<int>(end - p));
619	}
620
621	UString UString::from(double d)
622	{
623	// avoid ever printing -NaN, in JS conceptually there is only one NaN value
624	if (isNaN(d))
625	return UString ("NaN", `3`);
626
627	char buf[`80`];
628	int decimalPoint;
629	int sign;
630
631	char *result = kjs_dtoa(d, `0`, `0`, &decimalPoint, &sign, NULL);
632	int length = static_cast<int>(strlen(result));
633
634	int i = `0`;
635	if (sign) {
636	buf[i++] = '-';
637	}
638
639	if (decimalPoint <= `0` && decimalPoint > -`6`) {
640	buf[i++] = '0';
641	buf[i++] = '.';
642	for (int j = decimalPoint; j < `0`; j++) {
643	buf[i++] = '0';
644	}
645	strcpy(buf + i, result);
646	i += length;
647	} else if (decimalPoint <= `21` && decimalPoint > `0`) {
648	if (length <= decimalPoint) {
649	strcpy(buf + i, result);
650	i += length;
651	for (int j = `0`; j < decimalPoint - length; j++) {
652	buf[i++] = '0';
653	}
654	// buf[i] = '\0';
655	} else {
656	strncpy(buf + i, result, decimalPoint);
657	i += decimalPoint;
658	buf[i++] = '.';
659	strcpy(buf + i, result + decimalPoint);
660	i += length - decimalPoint;
661	}
662	} else if (result[`0`] < '0' \|\| result[`0`] > '9') {
663	strcpy(buf + i, result);
664	i += length;
665	} else {
666	buf[i++] = result[`0`];
667	if (length > `1`) {
668	buf[i++] = '.';
669	strcpy(buf + i, result + `1`);
670	i += length - `1`;
671	}
672
673	buf[i++] = 'e';
674	buf[i++] = (decimalPoint >= `0`) ? '+' : '-';
675	// decimalPoint can't be more than 3 digits decimal given the
676	// nature of float representation
677	int exponential = decimalPoint - `1`;
678	if (exponential < `0`) {
679	exponential = exponential * -`1`;
680	}
681	if (exponential >= `100`) {
682	buf[i++] = '0' + exponential / `100`;
683	}
684	if (exponential >= `10`) {
685	buf[i++] = '0' + (exponential % `100`) / `10`;
686	}
687	buf[i++] = '0' + exponential % `10`;
688	// buf[i++] = '\0';
689	}
690
691	kjs_freedtoa(result);
692
693	return UString (buf, i);
694	}
695
696	UString UString::spliceSubstringsWithSeparators(const Range substringRanges, int* rangeCount, const UString separators, int* separatorCount) const
697	{
698	if (rangeCount == `1` && separatorCount == `0`) {
699	int thisSize = size();
700	int position = substringRanges[`0`].position;
701	int length = substringRanges[`0`].length;
702	if (position <= `0` && length >= thisSize)
703	return *this;
704	return UString::Rep::create(m_rep, maxInt(`0`, position), minInt(thisSize, length));
705	}
706
707	int totalLength = `0`;
708	for (int i = `0`; i < rangeCount; i++)
709	totalLength += substringRanges[i].length;
710	for (int i = `0`; i < separatorCount; i++)
711	totalLength += separators[i].size();
712
713	if (totalLength == `0`)
714	return "";
715
716	UChar* buffer = allocChars(totalLength);
717	if (!buffer)
718	return null();
719
720	int maxCount = max(rangeCount, separatorCount);
721	int bufferPos = `0`;
722	for (int i = `0`; i < maxCount; i++) {
723	if (i < rangeCount) {
724	memcpy(buffer + bufferPos, data() + substringRanges[i].position, substringRanges[i].length * sizeof(UChar));
725	bufferPos += substringRanges[i].length;
726	}
727	if (i < separatorCount) {
728	memcpy(buffer + bufferPos, separators[i].data(), separators[i].size() * sizeof(UChar));
729	bufferPos += separators[i].size();
730	}
731	}
732
733	return UString::Rep::create(buffer, totalLength);
734	}
735
736	// Append a sub-string of <subStr> to this string.
737	// Equivalent to append(subStr.substr(subPos, subLength))
738
739	UString& UString::append(const UString& subStr, int subPos, int subLength)
740	{
741	int subSize = subStr.size();
742
743	if (subPos < `0`)
744	subPos = `0`;
745	else if (subPos >= subSize)
746	subPos = subSize;
747	if (subLength < `0`)
748	subLength = subSize;
749	if (subPos + subLength >= subSize)
750	subLength = subSize - subPos;
751
752	return append(UString (subStr.data() + subPos, subLength));
753	}
754
755	UString &UString::append(const UString &t)
756	{
757	int thisSize = size();
758	int thisOffset = m_rep ->offset;
759	int tSize = t.size();
760	int length = thisSize + tSize;
761
762	// possible cases:
763	if (thisSize == `0`) {
764	// this is empty
765	*this = t;
766	} else if (tSize == `0`) {
767	// t is empty
768	} else if (m_rep ->baseIsSelf() && m_rep ->rc == `1`) {
769	// this is direct and has refcount of 1 (so we can just alter it directly)
770	expandCapacity(thisOffset + length);
771	if (data()) {
772	memcpy(const_cast<UChar>(data() + thisSize), t.data(), tSize sizeof(UChar));
773	m_rep ->len = length;
774	m_rep ->_hash = `0`;
775	}
776	} else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
777	// this reaches the end of the buffer - extend it if it's long enough to append to
778	expandCapacity(thisOffset + length);
779	if (data()) {
780	memcpy(const_cast<UChar>(data() + thisSize), t.data(), tSize sizeof(UChar));
781	m_rep = Rep::create(m_rep, `0`, length);
782	}
783	} else {
784	// this is shared with someone using more capacity, gotta make a whole new string
785	size_t newCapacity = expandedSize(length, `0`);
786	UChar* d = allocChars(newCapacity);
787	if (!d)
788	m_rep = &Rep::null;
789	else {
790	memcpy(d, data(), thisSize * sizeof(UChar));
791	memcpy(const_cast<UChar>(d + thisSize), t.data(), tSize sizeof(UChar));
792	m_rep = Rep::create(d, length);
793	m_rep ->capacity = newCapacity;
794	}
795	}
796
797	return *this;
798	}
799
800
801	UString &UString::append(const char *t)
802	{
803	int thisSize = size();
804	int thisOffset = m_rep ->offset;
805	int tSize = static_cast<int>(strlen(t));
806	int length = thisSize + tSize;
807
808	// possible cases:
809	if (thisSize == `0`) {
810	// this is empty
811	*this = t;
812	} else if (tSize == `0`) {
813	// t is empty, we'll just return this below.*
814	} else if (m_rep ->baseIsSelf() && m_rep ->rc == `1`) {
815	// this is direct and has refcount of 1 (so we can just alter it directly)
816	expandCapacity(thisOffset + length);
817	UChar d = const_cast<UChar >(data());
818	if (d) {
819	for (int i = `0`; i < tSize; ++i)
820	d[thisSize + i] = t[i];
821	m_rep ->len = length;
822	m_rep ->_hash = `0`;
823	}
824	} else if (thisOffset + thisSize == usedCapacity() && thisSize >= minShareSize) {
825	// this string reaches the end of the buffer - extend it
826	expandCapacity(thisOffset + length);
827	UChar d = const_cast<UChar >(data());
828	if (d) {
829	for (int i = `0`; i < tSize; ++i)
830	d[thisSize + i] = t[i];
831	m_rep = Rep::create(m_rep, `0`, length);
832	}
833	} else {
834	// this is shared with someone using more capacity, gotta make a whole new string
835	size_t newCapacity = expandedSize(length, `0`);
836	UChar* d = allocChars(newCapacity);
837	if (!d)
838	m_rep = &Rep::null;
839	else {
840	memcpy(d, data(), thisSize * sizeof(UChar));
841	for (int i = `0`; i < tSize; ++i)
842	d[thisSize + i] = t[i];
843	m_rep = Rep::create(d, length);
844	m_rep ->capacity = newCapacity;
845	}
846	}
847
848	return *this;
849	}
850
851	UString &UString::append(unsigned short c)
852	{
853	int thisOffset = m_rep ->offset;
854	int length = size();
855
856	// possible cases:
857	if (length == `0`) {
858	// this is empty - must make a new m_rep because we don't want to pollute the shared empty one
859	size_t newCapacity = expandedSize(`1`, `0`);
860	UChar* d = allocChars(newCapacity);
861	if (!d)
862	m_rep = &Rep::null;
863	else {
864	d[`0`] = c;
865	m_rep = Rep::create(d, `1`);
866	m_rep ->capacity = newCapacity;
867	}
868	} else if (m_rep ->baseIsSelf() && m_rep ->rc == `1`) {
869	// this is direct and has refcount of 1 (so we can just alter it directly)
870	expandCapacity(thisOffset + length + `1`);
871	UChar d = const_cast<UChar >(data());
872	if (d) {
873	d[length] = c;
874	m_rep ->len = length + `1`;
875	m_rep ->_hash = `0`;
876	}
877	} else if (thisOffset + length == usedCapacity() && length >= minShareSize) {
878	// this reaches the end of the string - extend it and share
879	expandCapacity(thisOffset + length + `1`);
880	UChar d = const_cast<UChar >(data());
881	if (d) {
882	d[length] = c;
883	m_rep = Rep::create(m_rep, `0`, length + `1`);
884	}
885	} else {
886	// this is shared with someone using more capacity, gotta make a whole new string
887	size_t newCapacity = expandedSize(length + `1`, `0`);
888	UChar* d = allocChars(newCapacity);
889	if (!d)
890	m_rep = &Rep::null;
891	else {
892	memcpy(d, data(), length * sizeof(UChar));
893	d[length] = c;
894	m_rep = Rep::create(d, length + `1`);
895	m_rep ->capacity = newCapacity;
896	}
897	}
898
899	return *this;
900	}
901
902	CString UString::cstring() const
903	{
904	return ascii();
905	}
906
907	char UString::ascii() const*
908	{
909	// Never make the buffer smaller than normalStatBufferSize.
910	// Thus we almost never need to reallocate.
911	int length = size();
912	int neededSize = length + `1`;
913	if (neededSize < normalStatBufferSize) {
914	neededSize = normalStatBufferSize;
915	}
916	if (neededSize != statBufferSize) {
917	delete [] statBuffer;
918	statBuffer = new char [neededSize];
919	statBufferSize = neededSize;
920	}
921
922	const UChar *p = data();
923	char *q = statBuffer;
924	const UChar *limit = p + length;
925	while (p != limit) {
926	q = static_cast<char*>(p->uc);
927	++p;
928	++q;
929	}
930	*q = '\0';
931
932	return statBuffer;
933	}
934
935	UString& UString::operator=(Empty)
936	{
937	m_rep = &Rep::empty;
938
939	return *this;
940	}
941
942	UString& UString::operator=(const char* c)
943	{
944	set(c, c ? strlen(c) : `0`);
945
946	return *this;
947	}
948
949	void UString::set(const char* c, int l)
950	{
951	if (!c) {
952	m_rep = &Rep::null;
953	return;
954	}
955
956	if (l == `0`) {
957	m_rep = &Rep::empty;
958	return;
959	}
960
961	UChar *d;
962	if (m_rep ->rc == `1` && l <= m_rep ->capacity && m_rep ->baseIsSelf() && m_rep ->offset == `0` && m_rep ->preCapacity == `0`) {
963	d = m_rep ->buf;
964	m_rep ->_hash = `0`;
965	m_rep ->len = l;
966	} else {
967	d = allocChars(l);
968	if (!d) {
969	m_rep = &Rep::null;
970	return;
971	}
972	m_rep = Rep::create(d, l);
973	}
974	for (int i = `0`; i < l; i++)
975	d[i].uc = static_cast<unsigned char>(c[i]);
976	}
977
978	bool UString::is8Bit() const
979	{
980	const UChar *u = data();
981	const UChar *limit = u + size();
982	while (u < limit) {
983	if (u->uc > `0xFF`)
984	return false;
985	++u;
986	}
987
988	return true;
989	}
990
991	const UChar UString::operator[](int pos) const
992	{
993	if (pos >= size())
994	return '\0';
995	return data()[pos];
996	}
997
998	double UString::toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const
999	{
1000	double d;
1001
1002	const int length = size();
1003	int leadingSpaces = `0`;
1004
1005	// skip leading white space
1006	while (leadingSpaces < length && CommonUnicode::isStrWhiteSpace(data()[leadingSpaces].uc))
1007	++leadingSpaces;
1008
1009	UString whitespaceSkipped = substr(leadingSpaces, length - leadingSpaces);
1010
1011	// FIXME: If tolerateTrailingJunk is true, then we want to tolerate non-8-bit junk
1012	// after the number, so is8Bit is too strict a check.
1013	if (!whitespaceSkipped.is8Bit())
1014	return NaN;
1015
1016	const char *c = whitespaceSkipped.ascii();
1017
1018	// empty string ?
1019	if (*c == '\0')
1020	return tolerateEmptyString ? `0.0` : NaN;
1021
1022	// hex number ?
1023	if (c == '0' && ((c+`1`) == 'x' \|\| *(c+`1`) == 'X')) {
1024	const char* firstDigitPosition = c + `2`;
1025	c++;
1026	d = `0.0`;
1027	while (*(++c)) {
1028	if (c >= '0' && c <= '9')
1029	d = d * `16.0` + *c - '0';
1030	else if ((c >= 'A' && c <= 'F') \|\| (c >= 'a' && c <= 'f'))
1031	d = d * `16.0` + (*c & `0xdf`) - 'A' + `10.0`;
1032	else
1033	break;
1034	}
1035
1036	if (d >= mantissaOverflowLowerBound)
1037	d = parseIntOverflow(firstDigitPosition, c - firstDigitPosition, `16`);
1038	} else {
1039	// regular number ?
1040	char *end;
1041	d = kjs_strtod(c, &end);
1042	if ((d != `0.0` \|\| end != c) && d != Inf && d != -Inf) {
1043	c = end;
1044	} else {
1045	double sign = `1.0`;
1046
1047	if (*c == '+')
1048	c++;
1049	else if (*c == '-') {
1050	sign = -`1.0`;
1051	c++;
1052	}
1053
1054	// We used strtod() to do the conversion. However, strtod() handles
1055	// infinite values slightly differently than JavaScript in that it
1056	// converts the string "inf" with any capitalization to infinity,
1057	// whereas the ECMA spec requires that it be converted to NaN.
1058
1059	if (strncmp(c, "Infinity", `8`) == `0`) {
1060	d = sign * Inf;
1061	c += `8`;
1062	} else if ((d == Inf \|\| d == -Inf) && c != 'I' && c != 'i')
1063	c = end;
1064	else
1065	return NaN;
1066	}
1067	}
1068
1069	// allow trailing white space
1070	while (isASCIISpace(*c))
1071	c++;
1072	// don't allow anything after - unless tolerant=true
1073	if (!tolerateTrailingJunk && *c != '\0')
1074	d = NaN;
1075
1076	return d;
1077	}
1078
1079	#ifdef __FAST_MATH__
1080	# error "KJS does not work correctly with -ffast-math"
1081	#endif
1082
1083	double UString::toDouble(bool tolerateTrailingJunk) const
1084	{
1085	return toDouble(tolerateTrailingJunk, true);
1086	}
1087
1088	double UString::toDouble() const
1089	{
1090	return toDouble(false, true);
1091	}
1092
1093	uint32_t UString::toStrictUInt32(bool ok) const*
1094	{
1095	if (ok)
1096	ok = false*;
1097
1098	// Empty string is not OK.
1099	int len = m_rep ->len;
1100	if (len == `0`)
1101	return `0`;
1102	const UChar *p = m_rep ->data();
1103	unsigned short c = p->unicode();
1104
1105	// If the first digit is 0, only 0 itself is OK.
1106	if (c == '0') {
1107	if (len == `1` && ok)
1108	ok = true*;
1109	return `0`;
1110	}
1111
1112	// Convert to UInt32, checking for overflow.
1113	uint32_t i = `0`;
1114	while (`1`) {
1115	// Process character, turning it into a digit.
1116	if (c < '0' \|\| c > '9')
1117	return `0`;
1118	const unsigned d = c - '0';
1119
1120	// Multiply by 10, checking for overflow out of 32 bits.
1121	if (i > `0xFFFFFFFFU` / `10`)
1122	return `0`;
1123	i *= `10`;
1124
1125	// Add in the digit, checking for overflow out of 32 bits.
1126	const unsigned max = `0xFFFFFFFFU` - d;
1127	if (i > max)
1128	return `0`;
1129	i += d;
1130
1131	// Handle end of string.
1132	if (--len == `0`) {
1133	if (ok)
1134	ok = true*;
1135	return i;
1136	}
1137
1138	// Get next character.
1139	c = (++p)->unicode();
1140	}
1141	}
1142
1143	int UString::find(const UString &f, int pos) const
1144	{
1145	int sz = size();
1146	int fsz = f.size();
1147	if (sz < fsz)
1148	return -`1`;
1149	if (pos < `0`)
1150	pos = `0`;
1151	if (fsz == `0`)
1152	return pos;
1153	const UChar* data_ = data();
1154	const UChar* end = data_ + sz - fsz;
1155	int fsizeminusone = (fsz - `1`) * sizeof(UChar);
1156	const UChar *fdata = f.data();
1157	unsigned short fchar = fdata->uc;
1158	++fdata;
1159	for (const UChar* c = data_ + pos; c <= end; c++)
1160	if (c->uc == fchar && !memcmp(c + `1`, fdata, fsizeminusone))
1161	return (c - data_);
1162
1163	return -`1`;
1164	}
1165
1166	int UString::find(UChar ch, int pos) const
1167	{
1168	if (pos < `0`)
1169	pos = `0`;
1170	const UChar* data_ = data();
1171	const UChar *end = data_ + size();
1172	for (const UChar *c = data_ + pos; c < end; c++)
1173	if (*c == ch)
1174	return (c - data_);
1175
1176	return -`1`;
1177	}
1178
1179	int UString::rfind(const UString &f, int pos) const
1180	{
1181	int sz = size();
1182	int fsz = f.size();
1183	if (sz < fsz)
1184	return -`1`;
1185	if (pos < `0`)
1186	pos = `0`;
1187	if (pos > sz - fsz)
1188	pos = sz - fsz;
1189	if (fsz == `0`)
1190	return pos;
1191	int fsizeminusone = (fsz - `1`) * sizeof(UChar);
1192	const UChar *fdata = f.data();
1193	const UChar* data_ = data();
1194	for (const UChar* c = data_ + pos; c >= data_; c--) {
1195	if (c == fdata && !memcmp(c + `1`, fdata + `1`, fsizeminusone))
1196	return (c - data_);
1197	}
1198
1199	return -`1`;
1200	}
1201
1202	int UString::rfind(UChar ch, int pos) const
1203	{
1204	if (isEmpty())
1205	return -`1`;
1206	if (pos + `1` >= size())
1207	pos = size() - `1`;
1208	const UChar* data_ = data();
1209	for (const UChar* c = data_ + pos; c >= data_; c--) {
1210	if (*c == ch)
1211	return (c - data_);
1212	}
1213
1214	return -`1`;
1215	}
1216
1217	UString UString::substr(int pos, int len) const
1218	{
1219	int s = size();
1220
1221	if (pos < `0`)
1222	pos = `0`;
1223	else if (pos >= s)
1224	pos = s;
1225	if (len < `0`)
1226	len = s;
1227	if (pos + len >= s)
1228	len = s - pos;
1229
1230	if (pos == `0` && len == s)
1231	return *this;
1232
1233	return UString(Rep::create(m_rep, pos, len));
1234	}
1235
1236	void UString::copyForWriting()
1237	{
1238	int l = size();
1239	if (!l) return; // Not going to touch anything anyway.
1240	if (m_rep ->rc > `1` \|\| !m_rep ->baseIsSelf()) {
1241	UChar* n = allocChars(l);
1242	memcpy(n, data(), l * sizeof(UChar));
1243	m_rep = Rep::create(n, l);
1244	}
1245	}
1246
1247	bool operator==(const UString& s1, const UString& s2)
1248	{
1249	#if 0
1250	if (s1.m_rep == s2.m_rep)
1251	return true;
1252	#endif
1253
1254	if (s1.m_rep ->len != s2.m_rep ->len)
1255	return false;
1256
1257	return (memcmp(s1.m_rep ->data(), s2.m_rep ->data(),
1258	s1.m_rep ->len * sizeof(UChar)) == `0`);
1259	}
1260
1261	bool operator==(const UString& s1, const char *s2)
1262	{
1263	if (s2 == `0`) {
1264	return s1.isEmpty();
1265	}
1266
1267	const UChar *u = s1.data();
1268	const UChar *uend = u + s1.size();
1269	while (u != uend && *s2) {
1270	if (u->uc != (unsigned char)*s2)
1271	return false;
1272	s2++;
1273	u++;
1274	}
1275
1276	return u == uend && *s2 == `0`;
1277	}
1278
1279	bool operator<(const UString& s1, const UString& s2)
1280	{
1281	const int l1 = s1.size();
1282	const int l2 = s2.size();
1283	const int lmin = l1 < l2 ? l1 : l2;
1284	const UChar *c1 = s1.data();
1285	const UChar *c2 = s2.data();
1286	int l = `0`;
1287	while (l < lmin && c1 == c2) {
1288	c1++;
1289	c2++;
1290	l++;
1291	}
1292	if (l < lmin)
1293	return (c1->uc < c2->uc);
1294
1295	return (l1 < l2);
1296	}
1297
1298	bool UString::equal(const UString::Rep r, const* UString::Rep *b)
1299	{
1300	if (r == b)
1301	return true;
1302
1303	int length = r->len;
1304	if (length != b->len)
1305	return false;
1306
1307	const UChar *d = r->data();
1308	const UChar *s = b->data();
1309	for (int i = `0`; i != length; ++i)
1310	if (d[i].uc != s[i].uc)
1311	return false;
1312	return true;
1313	}
1314
1315
1316	int compare(const UString& s1, const UString& s2)
1317	{
1318	const int l1 = s1.size();
1319	const int l2 = s2.size();
1320	const int lmin = l1 < l2 ? l1 : l2;
1321	const UChar *c1 = s1.data();
1322	const UChar *c2 = s2.data();
1323	int l = `0`;
1324	while (l < lmin && c1 == c2) {
1325	c1++;
1326	c2++;
1327	l++;
1328	}
1329
1330	if (l < lmin)
1331	return (c1->uc > c2->uc) ? `1` : -`1`;
1332
1333	if (l1 == l2)
1334	return `0`;
1335
1336	return (l1 > l2) ? `1` : -`1`;
1337	}
1338
// Given a first byte with its high bit set, gives the length (2, 3 or 4) of
// the UTF-8 sequence it begins. Returns 0 for any byte that does not match
// the 110xxxxx, 1110xxxx or 11110xxx lead-byte patterns.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
  // The top two bits must both be set for a multi-byte lead byte.
  if ((b0 & 0xC0) != 0xC0)
    return 0;
  if ((b0 & 0xE0) == 0xC0)
    return 2;
  if ((b0 & 0xF0) == 0xE0)
    return 3;
  if ((b0 & 0xF8) == 0xF0)
    return 4;
  return 0;
}
1351
1352	int UTF8SequenceLengthNonASCII(char b0)
1353	{
1354	return inlineUTF8SequenceLengthNonASCII(b0);
1355	}
1356
1357	inline int inlineUTF8SequenceLength(char b0)
1358	{
1359	return (b0 & `0x80`) == `0` ? `1` : UTF8SequenceLengthNonASCII(b0);
1360	}
1361
1362	// Given a first byte, gives the length of the UTF-8 sequence it begins.
1363	// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
1364	// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
1365	int UTF8SequenceLength(char b0)
1366	{
1367	return (b0 & `0x80`) == `0` ? `1` : inlineUTF8SequenceLengthNonASCII(b0);
1368	}
1369
1370	// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
1371	// Only allows Unicode characters (U-00000000 to U-0010FFFF).
1372	// Returns -1 if the sequence is not valid (including presence of extra bytes).
1373	int decodeUTF8Sequence(const char *sequence)
1374	{
1375	// Handle 0-byte sequences (never valid).
1376	const unsigned char b0 = sequence[`0`];
1377	const int length = inlineUTF8SequenceLength(b0);
1378	if (length == `0`)
1379	return -`1`;
1380
1381	// Handle 1-byte sequences (plain ASCII).
1382	const unsigned char b1 = sequence[`1`];
1383	if (length == `1`) {
1384	if (b1)
1385	return -`1`;
1386	return b0;
1387	}
1388
1389	// Handle 2-byte sequences.
1390	if ((b1 & `0xC0`) != `0x80`)
1391	return -`1`;
1392	const unsigned char b2 = sequence[`2`];
1393	if (length == `2`) {
1394	if (b2)
1395	return -`1`;
1396	const int c = ((b0 & `0x1F`) << `6`) \| (b1 & `0x3F`);
1397	if (c < `0x80`)
1398	return -`1`;
1399	return c;
1400	}
1401
1402	// Handle 3-byte sequences.
1403	if ((b2 & `0xC0`) != `0x80`)
1404	return -`1`;
1405	const unsigned char b3 = sequence[`3`];
1406	if (length == `3`) {
1407	if (b3)
1408	return -`1`;
1409	const int c = ((b0 & `0xF`) << `12`) \| ((b1 & `0x3F`) << `6`) \| (b2 & `0x3F`);
1410	if (c < `0x800`)
1411	return -`1`;
1412	// UTF-16 surrogates should never appear in UTF-8 data.
1413	if (c >= `0xD800` && c <= `0xDFFF`)
1414	return -`1`;
1415	// Backwards BOM and U+FFFF should never appear in UTF-8 data.
1416	if (c == `0xFFFE` \|\| c == `0xFFFF`)
1417	return -`1`;
1418	return c;
1419	}
1420
1421	// Handle 4-byte sequences.
1422	if ((b3 & `0xC0`) != `0x80`)
1423	return -`1`;
1424	const unsigned char b4 = sequence[`4`];
1425	if (length == `4`) {
1426	if (b4)
1427	return -`1`;
1428	const int c = ((b0 & `0x7`) << `18`) \| ((b1 & `0x3F`) << `12`) \| ((b2 & `0x3F`) << `6`) \| (b3 & `0x3F`);
1429	if (c < `0x10000` \|\| c > `0x10FFFF`)
1430	return -`1`;
1431	return c;
1432	}
1433
1434	return -`1`;
1435	}
1436
1437	CString UString::UTF8String() const
1438	{
1439	// Allocate a buffer big enough to hold all the characters.
1440	const int length = size();
1441	Vector<char, `1024`> buffer(length * `3`);
1442
1443	// Convert to runs of 8-bit characters.
1444	char *p = buffer.begin();
1445	const unsigned short* d = &data()->uc;
1446	for (int i = `0`; i != length; ++i) {
1447	unsigned int c = d[i], sc;
1448	if (c < `0x80`) {
1449	p++ = (char*)c;
1450	} else if (c < `0x800`) {
1451	p++ = (char)((c >> `6`) \| `0xC0`); // C0 is the 2-byte flag for UTF-8*
1452	p++ = (char)((c \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
1453	} else if (c >= `0xD800` && c <= `0xDBFF` && (i+`1`) < length &&
1454	(sc = d[i+`1`]) >= `0xDC00` && sc <= `0xDFFF`) {
1455	sc = `0x10000` + (((c & `0x3FF`) << `10`) \| (sc & `0x3FF`));
1456	p++ = (char)((sc >> `18`) \| `0xF0`); // F0 is the 4-byte flag for UTF-8*
1457	p++ = (char)(((sc >> `12`) \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
1458	p++ = (char)(((sc >> `6`) \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
1459	p++ = (char)((sc \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
1460	++i;
1461	} else {
1462	p++ = (char)((c >> `12`) \| `0xE0`); // E0 is the 3-byte flag for UTF-8*
1463	p++ = (char)(((c >> `6`) \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
1464	p++ = (char)((c \| `0x80`) & `0xBF`); // next 6 bits, with high bit set*
1465	}
1466	}
1467
1468	// Return the result as a C string.
1469	CString result(buffer.data(), p - buffer.data());
1470
1471	return result;
1472	}
1473
1474	} // namespace KJS
1475