ustring.h [kdelibs/kjs/ustring.h]

// -*- c-basic-offset: 2 -*-
2	/*
3	* This file is part of the KDE libraries
4	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
5	* Copyright (C) 2004 Apple Computer, Inc.
6	*
7	* This library is free software; you can redistribute it and/or
8	* modify it under the terms of the GNU Library General Public
9	* License as published by the Free Software Foundation; either
10	* version 2 of the License, or (at your option) any later version.
11	*
12	* This library is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	* Library General Public License for more details.
16	*
17	* You should have received a copy of the GNU Library General Public License
18	* along with this library; see the file COPYING.LIB. If not, write to
19	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20	* Boston, MA 02110-1301, USA.
21	*
22	*/
23
24	#ifndef _KJS_USTRING_H_
25	#define _KJS_USTRING_H_
26
27	#include "kjs/global.h"
28
29	#include <wtf/AlwaysInline.h>
30	#include <wtf/FastMalloc.h>
31	#include <wtf/RefPtr.h>
32	#include <wtf/PassRefPtr.h>
33	#include <wtf/Vector.h>
34
35	#include <assert.h>
36	#include "collector.h"
37	#ifdef HAVE_STDINT_H
38	#include <stdint.h>
39	#endif
40
41	/ On some ARM platforms GCC won't pack structures by default so sizeof(UChar)*
42	will end up being != 2 which causes crashes since the code depends on that. /*
43	#if COMPILER(GCC) && PLATFORM(FORCE_PACK)
44	#define PACK_STRUCT __attribute__((packed))
45	#else
46	#define PACK_STRUCT
47	#endif
48
/**
 * @internal
 *
 * Forward declarations of the external string types that UString offers
 * conversion hooks for. Only the names are needed in this header.
 */
namespace DOM { class DOMString; }
namespace khtml { class AtomicString; }

class QString;
class QConstString;
60
61	namespace KJS {
62
63	class UString;
64
65	/**
66	* @short Unicode character.
67	*
68	* UChar represents a 16 bit Unicode character. Its internal data
69	* representation is compatible to XChar2b and QChar. It's therefore
70	* possible to exchange data with X and Qt with shallow copies.
71	*/
72	struct KJS_EXPORT UChar {
73	/**
74	* Construct a character with uninitialized value.
75	*/
76	UChar();
77	/**
78	* Construct a character with the value denoted by the arguments.
79	* @param h higher byte
80	* @param l lower byte
81	*/
82	UChar(unsigned char h , unsigned char l);
83	/**
84	* Construct a character with the given value.
85	* @param u 16 bit Unicode value
86	*/
87	UChar(char u);
88	UChar(unsigned char u);
89	UChar(unsigned short u);
90	/**
91	* @return The higher byte of the character.
92	*/
93	unsigned char high() const { return static_cast<unsigned char>(uc >> `8`); }
94
95	/**
96	* @return The lower byte of the character.
97	*/
98	unsigned char low() const { return static_cast<unsigned char>(uc); }
99
100	/**
101	* @return the 16 bit Unicode value of the character
102	*/
103	unsigned short unicode() const { return uc; }
104
105	unsigned short uc;
106	} PACK_STRUCT;
107
108	inline UChar::UChar() { }
109	inline UChar::UChar(unsigned char h , unsigned char l) : uc(h << `8` \| l) { }
110	inline UChar::UChar(char u) : uc((unsigned char)u) { }
111	inline UChar::UChar(unsigned char u) : uc(u) { }
112	inline UChar::UChar(unsigned short u) : uc(u) { }
113
114	/**
115	* @short 8 bit char based string class
116	*/
117	class KJS_EXPORT CString {
118	public:
119	CString() : data(`0`), length(`0`) { }
120	CString(const char *c);
121	CString(const char *c, size_t len);
122	CString(const CString &);
123
124	~CString();
125
126	CString &operator=(const char *c);
127	CString &operator=(const CString &);
128
129	size_t size() const { return length; }
130	const char c_str() const* { return data; }
131	private:
132	char *data;
133	size_t length;
134	};
135
136	/**
137	* @short Unicode string class
138	*/
139	class KJS_EXPORT UString {
140	KJS_EXPORT friend bool operator==(const UString&, const UString&);
141
142	public:
143	/**
144	* @internal
145	*/
146	struct KJS_EXPORT Rep {
147
148	static PassRefPtr<Rep> create(UChar d, int* l);
149	static PassRefPtr<Rep> createCopying(const UChar d, int* l);
150	static PassRefPtr<Rep> create(PassRefPtr<Rep> base, int offset, int length);
151
152	void destroy();
153
154	bool baseIsSelf() const { return baseString == this; }
155	UChar* data() const { return baseString->buf + baseString->preCapacity + offset; }
156	int size() const { return len; }
157
158	unsigned hash() const { if (_hash == `0`) _hash = computeHash(data(), len); return _hash; }
159	unsigned computedHash() const { assert(_hash); return _hash; } // fast path for Identifiers
160	static unsigned computeHash(const UChar , int* length);
161	static unsigned computeHash(const char* s, int length);
162	static unsigned computeHash(const char *);
163
164	Rep* ref() { ++rc; return this; }
165	ALWAYS_INLINE void deref() { if (--rc == `0`) destroy(); }
166
167	// unshared data
168	int offset;
169	int len;
170	int rc;
171	mutable unsigned _hash;
172	bool isIdentifier;
173	UString::Rep* baseString;
174	size_t reportedCost;
175
176	// potentially shared data
177	UChar *buf;
178	int usedCapacity;
179	int capacity;
180	int usedPreCapacity;
181	int preCapacity;
182
183	static Rep null;
184	static Rep empty;
185	};
186
187	public:
188	/**
189	* Constructs a null string.
190	*/
191	UString();
192	/**
193	* Constructs an empty string.
194	*/
195	enum Empty { empty };
196	UString(Empty);
197	/**
198	* Constructs a string from the single character c.
199	*/
200	explicit UString(char c);
201	/**
202	* Constructs a string from a classical zero determined char string.
203	*/
204	UString(const char *c);
205	UString(const char* c, size_t length);
206	/**
207	* Constructs a string from an array of Unicode characters of the specified
208	* length.
209	*/
210	UString(const UChar c, int* length);
211	/**
212	* If copy is false the string data will be adopted.
213	* That means that the data will NOT be copied and the pointer will
214	* be deleted when the UString object is modified or destroyed.
215	* Behaviour defaults to a deep copy if copy is true.
216	*/
217	UString(UChar c, int* length, bool copy);
218	/**
219	* Copy constructor. Makes a shallow copy only.
220	*/
221	UString(const UString &s) : m_rep (s.m_rep) {}
222
223	UString(const Vector<UChar>& buffer);
224
225	/**
226	* Convenience declaration only ! You'll be on your own to write the
227	* implementation for a construction from QString.
228	*
229	* Note: feel free to contact me if you want to see a dummy header for
230	* your favorite FooString class here !
231	*/
232	KJS_EXTERNAL_EXPORT UString(const QString&);
233	/**
234	* Convenience declaration only ! See UString(const QString&).
235	*/
236	KJS_EXTERNAL_EXPORT UString(const DOM::DOMString&);
237	/**
238	* Convenience declaration only ! See UString(const QString&).
239	*/
240	KJS_EXTERNAL_EXPORT UString(const khtml::AtomicString&);
241
242	/**
243	* Concatenation constructor. Makes operator+ more efficient.
244	*/
245	UString(const UString &, const UString &);
246	/**
247	* Destructor.
248	*/
249	~UString() {}
250
251	/**
252	* Constructs a string from an int.
253	*/
254	static UString from(int i);
255	/**
256	* Constructs a string from an unsigned int.
257	*/
258	static UString from(unsigned int u);
259	/**
260	* Constructs a string from a long int.
261	*/
262	static UString from(long u);
263	/**
264	* Constructs a string from a double.
265	*/
266	static UString from(double d);
267
268
269	static bool equal(const UString::Rep* a, const UString::Rep* b);
270
271	struct Range {
272	public:
273	Range(int pos, int len) : position(pos), length(len) {}
274	Range() {}
275	int position;
276	int length;
277	};
278
279	UString spliceSubstringsWithSeparators(const Range substringRanges, int* rangeCount, const UString separators, int* separatorCount) const;
280
281	/**
282	* Append another string.
283	*/
284	UString& append(const UString& subStr, int subPos, int subLength = -`1`);
285	UString& append(const UString& t);
286	UString& append(const char* t);
287	UString& append(const char* t, int tSize);
288	UString& append(unsigned short);
289	UString& append(char c) { return append(static_cast<unsigned short>(static_cast<unsigned char>(c))); }
290	UString& append(UChar c) { return append(c.uc); }
291
292	/**
293	* @return The string converted to the 8-bit string type CString().
294	*/
295	CString cstring() const;
296	/**
297	* Convert the Unicode string to plain ASCII chars chopping of any higher
298	* bytes. This method should only be used for debugging purposes as it
299	* is neither Unicode safe nor free from side effects. In order not to
300	* waste any memory the char buffer is static and shared by all UString
301	* instances.
302	*/
303	char ascii() const*;
304
305	/**
306	* Convert the string to UTF-8, assuming it is UTF-16 encoded.
307	* Since this function is tolerant of badly formed UTF-16, it can create UTF-8
308	* strings that are invalid because they have characters in the range
309	* U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to
310	* be otherwise valid.
311	*/
312	CString UTF8String() const;
313
314	/**
315	* @see UString(const QString&).
316	*/
317	KJS_EXTERNAL_EXPORT DOM::DOMString domString() const;
318	/**
319	* @see UString(const QString&).
320	*/
321	KJS_EXTERNAL_EXPORT QString qstring() const;
322	/**
323	* @see UString(const QString&).
324	*/
325	KJS_EXTERNAL_EXPORT QConstString qconststring() const;
326
327	/**
328	* Assignment operator.
329	*/
330	UString &operator=(const char *c);
331	UString& operator=(Empty);
332	/**
333	* Appends the specified string.
334	*/
335	UString &operator+=(const UString &s) { return append(s); }
336	UString &operator+=(const char s) { return* append(s); }
337
338	/**
339	* @return A pointer to the internal Unicode data.
340	*/
341	const UChar* data() const { return m_rep ->data(); }
342	/**
343	* @return True if null.
344	*/
345	bool isNull() const { return (m_rep == &Rep::null); }
346	/**
347	* @return True if null or zero length.
348	*/
349	bool isEmpty() const { return (!m_rep ->len); }
350	/**
351	* Use this if you want to make sure that this string is a plain ASCII
352	* string. For example, if you don't want to lose any information when
353	* using cstring() or ascii().
354	*
355	* @return True if the string doesn't contain any non-ASCII characters.
356	*/
357	bool is8Bit() const;
358	/**
359	* @return The length of the string.
360	*/
361	int size() const { return m_rep ->size(); }
362	/**
363	* Const character at specified position.
364	*/
365	const UChar operator[](int pos) const;
366	/**
367	* Attempts an conversion to a number. Apart from floating point numbers,
368	* the algorithm will recognize hexadecimal representations (as
369	* indicated by a 0x or 0X prefix) and +/- Infinity.
370	* Returns NaN if the conversion failed.
371	* @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number.
372	* @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0.
373	*/
374	double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const;
375	double toDouble(bool tolerateTrailingJunk) const;
376	double toDouble() const;
377
378	/**
379	* Attempts an conversion to a 32-bit integer. ok will be set
380	* according to the success.
381	*/
382	uint32_t toStrictUInt32(bool ok = `0`) const*;
383
384	/**
385	* Attempts an conversion to an array index. The "ok" boolean will be set
386	* to true if it is a valid array index according to the rule from
387	* ECMA 15.2 about what an array index is. It must exactly match the string
388	* form of an unsigned integer, and be less than 2^32 - 1.
389	*/
390	unsigned toArrayIndex(bool ok = `0`) const*;
391
392	/**
393	* @return Position of first occurrence of f starting at position pos.
394	* -1 if the search was not successful.
395	*/
396	int find(const UString &f, int pos = `0`) const;
397	int find(UChar, int pos = `0`) const;
398	/**
399	* @return Position of first occurrence of f searching backwards from
400	* position pos.
401	* -1 if the search was not successful.
402	*/
403	int rfind(const UString &f, int pos) const;
404	int rfind(UChar, int pos) const;
405	/**
406	* @return The sub string starting at position pos and length len.
407	*/
408	UString substr(int pos = `0`, int len = -`1`) const;
409	/**
410	* Static instance of a null string.
411	*/
412	static const UString &null();
413
414	Rep* rep() const { return m_rep.get(); }
415	UString(PassRefPtr<Rep> r) : m_rep (r) { assert(m_rep); }
416	void copyForWriting();
417
418	size_t cost() const;
419	private:
420	size_t expandedSize(size_t size, size_t otherSize) const;
421	int usedCapacity() const;
422	int usedPreCapacity() const;
423	void expandCapacity(int requiredLength);
424	void expandPreCapacity(int requiredPreCap);
425	void set(const char* c, int len);
426
427	RefPtr<Rep> m_rep;
428	};
429
430	KJS_EXPORT inline bool operator==(const UChar &c1, const UChar &c2) {
431	return (c1.uc == c2.uc);
432	}
433	KJS_EXPORT bool operator==(const UString& s1, const UString& s2);
434	KJS_EXPORT inline bool operator!=(const UString& s1, const UString& s2) {
435	return !KJS::operator==(s1, s2);
436	}
437	KJS_EXPORT bool operator<(const UString& s1, const UString& s2);
438	KJS_EXPORT bool operator==(const UString& s1, const char *s2);
439	KJS_EXPORT inline bool operator!=(const UString& s1, const char *s2) {
440	return !KJS::operator==(s1, s2);
441	}
442	KJS_EXPORT inline bool operator==(const char s1, const* UString& s2) {
443	return operator==(s2, s1);
444	}
445	KJS_EXPORT inline bool operator!=(const char s1, const* UString& s2) {
446	return !KJS::operator==(s1, s2);
447	}
448	KJS_EXPORT bool operator==(const CString& s1, const CString& s2);
449	KJS_EXPORT inline UString operator+(const UString& s1, const UString& s2) {
450	return UString (s1, s2);
451	}
452
453	KJS_EXPORT int compare(const UString &, const UString &);
454
/**
 * Given a first byte, gives the length of the UTF-8 sequence it begins.
 * Returns 0 for bytes that are not legal starts of UTF-8 sequences.
 * Only allows sequences of up to 4 bytes, since that works for all Unicode
 * characters (U-00000000 to U-0010FFFF).
 */
int UTF8SequenceLength(char);

/**
 * Takes a null-terminated C-style string with a UTF-8 sequence in it and
 * converts it to a character.
 * Only allows Unicode characters (U-00000000 to U-0010FFFF).
 * Returns -1 if the sequence is not valid (including presence of extra bytes).
 */
int decodeUTF8Sequence(const char *);
464
465	KJS_EXPORT inline UString::UString()
466	: m_rep (&Rep::null)
467	{
468	}
469
470	// Rule from ECMA 15.2 about what an array index is.
471	// Must exactly match string form of an unsigned integer, and be less than 2^32 - 1.
472	inline unsigned UString::toArrayIndex(bool ok) const*
473	{
474	unsigned i = toStrictUInt32(ok);
475	if (ok && i >= `0xFFFFFFFFU`)
476	ok = false*;
477	return i;
478	}
479
480	// We'd rather not do shared substring append for small strings, since
481	// this runs too much risk of a tiny initial string holding down a
482	// huge buffer.
483	// FIXME: this should be size_t but that would cause warnings until we
484	// fix UString sizes to be size_t instead of int
485	static const int minShareSize = Collector::minExtraCostSize / sizeof(UChar);
486
487	inline size_t UString::cost() const
488	{
489	size_t capacity = (m_rep ->baseString->capacity + m_rep ->baseString->preCapacity) * sizeof(UChar);
490	size_t reportedCost = m_rep ->baseString->reportedCost;
491	ASSERT(capacity >= reportedCost);
492
493	size_t capacityDelta = capacity - reportedCost;
494
495	if (capacityDelta < static_cast<size_t>(minShareSize))
496	return `0`;
497
498	m_rep ->baseString->reportedCost = capacity;
499	return capacityDelta;
500	}
501
502	} // namespace
503
504	namespace WTF {
505
506	template<typename T> struct DefaultHash;
507	template<typename T> struct StrHash;
508
509	template<> struct StrHash<KJS::UString::Rep *> {
510	static unsigned hash(const KJS::UString::Rep key) { return* key->hash(); }
511	static bool equal(const KJS::UString::Rep a, const* KJS::UString::Rep b) { return* KJS::UString::equal(a, b); }
512	static const bool safeToCompareToEmptyOrDeleted = false;
513	};
514
515	template<> struct DefaultHash<KJS::UString::Rep *> {
516	typedef StrHash<KJS::UString::Rep *> Hash;
517	};
518	} // namespace WTF
519
520
521	#endif
522