1 | // -*- c-basic-offset: 2 -*- |
2 | /* |
3 | * This file is part of the KDE libraries |
4 | * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) |
5 | * Copyright (C) 2004 Apple Computer, Inc. |
6 | * |
7 | * This library is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU Library General Public |
9 | * License as published by the Free Software Foundation; either |
10 | * version 2 of the License, or (at your option) any later version. |
11 | * |
12 | * This library is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | * Library General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU Library General Public License |
18 | * along with this library; see the file COPYING.LIB. If not, write to |
19 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
20 | * Boston, MA 02110-1301, USA. |
21 | * |
22 | */ |
23 | |
24 | #ifndef _KJS_USTRING_H_ |
25 | #define _KJS_USTRING_H_ |
26 | |
27 | #include "kjs/global.h" |
28 | |
29 | #include <wtf/AlwaysInline.h> |
30 | #include <wtf/FastMalloc.h> |
31 | #include <wtf/RefPtr.h> |
32 | #include <wtf/PassRefPtr.h> |
33 | #include <wtf/Vector.h> |
34 | |
35 | #include <assert.h> |
36 | #include "collector.h" |
37 | #ifdef HAVE_STDINT_H |
38 | #include <stdint.h> |
39 | #endif |
40 | |
/* On some ARM platforms GCC won't pack structures by default, so sizeof(UChar)
   will end up being != 2, which causes crashes since the code depends on that.
   PACK_STRUCT is appended to the UChar definition below: it expands to GCC's
   packed attribute when COMPILER(GCC) && PLATFORM(FORCE_PACK) holds, and to
   nothing otherwise. */
#if COMPILER(GCC) && PLATFORM(FORCE_PACK)
#define PACK_STRUCT __attribute__((packed))
#else
#define PACK_STRUCT
#endif
48 | |
/**
 * @internal
 *
 * Forward declarations of string types that belong to other libraries.
 * UString only declares conversions to and from these types, so their full
 * definitions are not needed in this header.
 */
namespace DOM {
  class DOMString;
}
namespace khtml {
  class AtomicString;
}
class QString;
class QConstString;
60 | |
61 | namespace KJS { |
62 | |
63 | class UString; |
64 | |
65 | /** |
66 | * @short Unicode character. |
67 | * |
68 | * UChar represents a 16 bit Unicode character. Its internal data |
69 | * representation is compatible to XChar2b and QChar. It's therefore |
70 | * possible to exchange data with X and Qt with shallow copies. |
71 | */ |
72 | struct KJS_EXPORT UChar { |
73 | /** |
74 | * Construct a character with uninitialized value. |
75 | */ |
76 | UChar(); |
77 | /** |
78 | * Construct a character with the value denoted by the arguments. |
79 | * @param h higher byte |
80 | * @param l lower byte |
81 | */ |
82 | UChar(unsigned char h , unsigned char l); |
83 | /** |
84 | * Construct a character with the given value. |
85 | * @param u 16 bit Unicode value |
86 | */ |
87 | UChar(char u); |
88 | UChar(unsigned char u); |
89 | UChar(unsigned short u); |
90 | /** |
91 | * @return The higher byte of the character. |
92 | */ |
93 | unsigned char high() const { return static_cast<unsigned char>(uc >> 8); } |
94 | |
95 | /** |
96 | * @return The lower byte of the character. |
97 | */ |
98 | unsigned char low() const { return static_cast<unsigned char>(uc); } |
99 | |
100 | /** |
101 | * @return the 16 bit Unicode value of the character |
102 | */ |
103 | unsigned short unicode() const { return uc; } |
104 | |
105 | unsigned short uc; |
106 | } PACK_STRUCT; |
107 | |
108 | inline UChar::UChar() { } |
109 | inline UChar::UChar(unsigned char h , unsigned char l) : uc(h << 8 | l) { } |
110 | inline UChar::UChar(char u) : uc((unsigned char)u) { } |
111 | inline UChar::UChar(unsigned char u) : uc(u) { } |
112 | inline UChar::UChar(unsigned short u) : uc(u) { } |
113 | |
/**
 * @short 8 bit char based string class
 *
 * Holds a char buffer pointer together with an explicit length. Only the
 * default constructor and the two accessors are defined inline here; the
 * remaining members are declared here and defined elsewhere.
 */
class KJS_EXPORT CString {
public:
  // Creates a string with a null data pointer and a length of 0.
  CString() : data(0), length(0) { }
  CString(const char *c);
  CString(const char *c, size_t len);
  CString(const CString &);

  ~CString();

  CString &operator=(const char *c);
  CString &operator=(const CString &);

  // Returns the stored length.
  size_t size() const { return length; }
  // Returns the stored data pointer as is. For a default-constructed string
  // that has not been assigned to, this is a null pointer.
  const char *c_str() const { return data; }
private:
  char *data;
  size_t length;
};
135 | |
/**
 * @short Unicode string class
 *
 * A UString holds its characters through a RefPtr to a Rep (m_rep). The copy
 * constructor copies only that pointer, so copies share one Rep.
 */
class KJS_EXPORT UString {
  KJS_EXPORT friend bool operator==(const UString&, const UString&);

public:
  /**
   * @internal
   *
   * Representation behind a UString. A Rep refers to a range of characters
   * inside the buffer of its baseString, which may be the Rep itself
   * (see baseIsSelf()).
   */
  struct KJS_EXPORT Rep {

    // Factory functions; defined out of line.
    // NOTE(review): judging by the names, create(UChar*, int) presumably
    // adopts d while createCopying() copies it -- confirm against the
    // implementation.
    static PassRefPtr<Rep> create(UChar *d, int l);
    static PassRefPtr<Rep> createCopying(const UChar *d, int l);
    static PassRefPtr<Rep> create(PassRefPtr<Rep> base, int offset, int length);

    // Called by deref() when rc drops to zero; defined out of line.
    void destroy();

    // True when this Rep is its own base string.
    bool baseIsSelf() const { return baseString == this; }
    // Start of this Rep's characters: the base string's buffer, advanced by
    // the base string's preCapacity and by this Rep's offset.
    UChar* data() const { return baseString->buf + baseString->preCapacity + offset; }
    // Number of characters in this Rep.
    int size() const { return len; }

    // Returns the hash, computing and caching it on first use. A stored value
    // of 0 is treated as "not computed yet".
    unsigned hash() const { if (_hash == 0) _hash = computeHash(data(), len); return _hash; }
    // Returns the cached hash without computing it; asserts that it is non-zero.
    unsigned computedHash() const { assert(_hash); return _hash; } // fast path for Identifiers
    static unsigned computeHash(const UChar *, int length);
    static unsigned computeHash(const char* s, int length);
    static unsigned computeHash(const char *);

    // Increments the reference count and returns this Rep.
    Rep* ref() { ++rc; return this; }
    // Decrements the reference count and destroys the Rep when it reaches zero.
    ALWAYS_INLINE void deref() { if (--rc == 0) destroy(); }

    // unshared data
    int offset;
    int len;
    int rc;
    mutable unsigned _hash; // written by hash() on a const Rep, hence mutable
    bool isIdentifier;
    UString::Rep* baseString;
    size_t reportedCost; // read and updated by UString::cost()

    // potentially shared data
    UChar *buf;
    int usedCapacity;
    int capacity;
    int usedPreCapacity;
    int preCapacity;

    // Rep::null is the representation used by default-constructed strings and
    // tested for by UString::isNull().
    static Rep null;
    static Rep empty;
  };

public:
  /**
   * Constructs a null string.
   */
  UString();
  /**
   * Constructs an empty string. Empty is a tag type whose only purpose is to
   * select this constructor (and the matching assignment operator).
   */
  enum Empty { empty };
  UString(Empty);
  /**
   * Constructs a string from the single character c.
   */
  explicit UString(char c);
  /**
   * Constructs a string from a classical zero-terminated char string.
   */
  UString(const char *c);
  /**
   * Variant taking the char string together with an explicit length.
   */
  UString(const char* c, size_t length);
  /**
   * Constructs a string from an array of Unicode characters of the specified
   * length.
   */
  UString(const UChar *c, int length);
  /**
   * If copy is false the string data will be adopted.
   * That means that the data will NOT be copied and the pointer will
   * be deleted when the UString object is modified or destroyed.
   * If copy is true a deep copy of the data is made instead.
   */
  UString(UChar *c, int length, bool copy);
  /**
   * Copy constructor. Makes a shallow copy only.
   */
  UString(const UString &s) : m_rep(s.m_rep) {}

  /**
   * Constructs a string from a vector of Unicode characters.
   */
  UString(const Vector<UChar>& buffer);

  /**
   * Convenience declaration only! You'll be on your own to write the
   * implementation for a construction from QString.
   *
   * Note: feel free to contact me if you want to see a dummy header for
   * your favorite FooString class here!
   */
  KJS_EXTERNAL_EXPORT UString(const QString&);
  /**
   * Convenience declaration only! See UString(const QString&).
   */
  KJS_EXTERNAL_EXPORT UString(const DOM::DOMString&);
  /**
   * Convenience declaration only! See UString(const QString&).
   */
  KJS_EXTERNAL_EXPORT UString(const khtml::AtomicString&);

  /**
   * Concatenation constructor. Makes operator+ more efficient.
   */
  UString(const UString &, const UString &);
  /**
   * Destructor.
   */
  ~UString() {}

  /**
   * Constructs a string from an int.
   */
  static UString from(int i);
  /**
   * Constructs a string from an unsigned int.
   */
  static UString from(unsigned int u);
  /**
   * Constructs a string from a long int.
   */
  static UString from(long u);
  /**
   * Constructs a string from a double.
   */
  static UString from(double d);


  /**
   * Equality test on two representations; defined out of line. Used by the
   * WTF::StrHash specialization at the end of this header.
   */
  static bool equal(const UString::Rep* a, const UString::Rep* b);

  /**
   * A (position, length) pair, taken by spliceSubstringsWithSeparators().
   * The default constructor leaves both fields uninitialized.
   */
  struct Range {
  public:
    Range(int pos, int len) : position(pos), length(len) {}
    Range() {}
    int position;
    int length;
  };

  /**
   * NOTE(review): presumably builds a new string from the given substring
   * ranges of this string interleaved with the given separators -- confirm
   * against the implementation, which is not part of this header.
   */
  UString spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const;

  /**
   * Append another string.
   */
  UString& append(const UString& subStr, int subPos, int subLength = -1);
  UString& append(const UString& t);
  UString& append(const char* t);
  UString& append(const char* t, int tSize);
  UString& append(unsigned short);
  // Converts through unsigned char first, so a negative char is not
  // sign-extended, then forwards to append(unsigned short).
  UString& append(char c) { return append(static_cast<unsigned short>(static_cast<unsigned char>(c))); }
  // Forwards the character's 16 bit value to append(unsigned short).
  UString& append(UChar c) { return append(c.uc); }

  /**
   * @return The string converted to the 8-bit string type CString().
   */
  CString cstring() const;
  /**
   * Convert the Unicode string to plain ASCII chars chopping off any higher
   * bytes. This method should only be used for *debugging* purposes as it
   * is neither Unicode safe nor free from side effects. In order not to
   * waste any memory the char buffer is static and *shared* by all UString
   * instances.
   */
  char *ascii() const;

  /**
   * Convert the string to UTF-8, assuming it is UTF-16 encoded.
   * Since this function is tolerant of badly formed UTF-16, it can create UTF-8
   * strings that are invalid because they have characters in the range
   * U+D800-U+DFFF (the surrogate range), U+FFFE, or U+FFFF, but the UTF-8
   * string is guaranteed to be otherwise valid.
   */
  CString UTF8String() const;

  /**
   * @see UString(const QString&).
   */
  KJS_EXTERNAL_EXPORT DOM::DOMString domString() const;
  /**
   * @see UString(const QString&).
   */
  KJS_EXTERNAL_EXPORT QString qstring() const;
  /**
   * @see UString(const QString&).
   */
  KJS_EXTERNAL_EXPORT QConstString qconststring() const;

  /**
   * Assignment operator.
   */
  UString &operator=(const char *c);
  UString& operator=(Empty);
  /**
   * Appends the specified string.
   */
  UString &operator+=(const UString &s) { return append(s); }
  UString &operator+=(const char *s) { return append(s); }

  /**
   * @return A pointer to the internal Unicode data.
   */
  const UChar* data() const { return m_rep->data(); }
  /**
   * @return True if null, i.e. if the string uses the Rep::null representation.
   */
  bool isNull() const { return (m_rep == &Rep::null); }
  /**
   * @return True if null or zero length.
   */
  bool isEmpty() const { return (!m_rep->len); }
  /**
   * Use this if you want to make sure that this string is a plain ASCII
   * string. For example, if you don't want to lose any information when
   * using cstring() or ascii().
   *
   * @return True if the string doesn't contain any non-ASCII characters.
   */
  bool is8Bit() const;
  /**
   * @return The length of the string.
   */
  int size() const { return m_rep->size(); }
  /**
   * Const character at specified position.
   */
  const UChar operator[](int pos) const;
  /**
   * Attempts a conversion to a number. Apart from floating point numbers,
   * the algorithm will recognize hexadecimal representations (as
   * indicated by a 0x or 0X prefix) and +/- Infinity.
   * Returns NaN if the conversion failed.
   * @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number.
   * @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0.
   */
  double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const;
  double toDouble(bool tolerateTrailingJunk) const;
  double toDouble() const;

  /**
   * Attempts a conversion to a 32-bit integer. ok will be set
   * according to the success.
   */
  uint32_t toStrictUInt32(bool *ok = 0) const;

  /**
   * Attempts a conversion to an array index. The "ok" boolean will be set
   * to true if it is a valid array index according to the rule from
   * ECMA 15.2 about what an array index is. It must exactly match the string
   * form of an unsigned integer, and be less than 2^32 - 1.
   */
  unsigned toArrayIndex(bool *ok = 0) const;

  /**
   * @return Position of first occurrence of f starting at position pos.
   * -1 if the search was not successful.
   */
  int find(const UString &f, int pos = 0) const;
  int find(UChar, int pos = 0) const;
  /**
   * @return Position of first occurrence of f searching backwards from
   * position pos.
   * -1 if the search was not successful.
   */
  int rfind(const UString &f, int pos) const;
  int rfind(UChar, int pos) const;
  /**
   * @return The sub string starting at position pos and length len.
   */
  UString substr(int pos = 0, int len = -1) const;
  /**
   * Static instance of a null string.
   */
  static const UString &null();

  // Raw pointer to the representation held by this string.
  Rep* rep() const { return m_rep.get(); }
  // Wraps an existing representation; asserts that it is non-null.
  UString(PassRefPtr<Rep> r) : m_rep(r) { assert(m_rep); }
  void copyForWriting();

  // Defined inline below the class: reports the part of the base string's
  // buffer cost that has not been reported yet, or 0 while that part is small.
  size_t cost() const;
private:
  // Capacity management helpers; all defined out of line.
  size_t expandedSize(size_t size, size_t otherSize) const;
  int usedCapacity() const;
  int usedPreCapacity() const;
  void expandCapacity(int requiredLength);
  void expandPreCapacity(int requiredPreCap);
  void set(const char* c, int len);

  // The representation holding the characters. The inline members above
  // dereference it without a null check.
  RefPtr<Rep> m_rep;
};
429 | |
430 | KJS_EXPORT inline bool operator==(const UChar &c1, const UChar &c2) { |
431 | return (c1.uc == c2.uc); |
432 | } |
433 | KJS_EXPORT bool operator==(const UString& s1, const UString& s2); |
434 | KJS_EXPORT inline bool operator!=(const UString& s1, const UString& s2) { |
435 | return !KJS::operator==(s1, s2); |
436 | } |
437 | KJS_EXPORT bool operator<(const UString& s1, const UString& s2); |
438 | KJS_EXPORT bool operator==(const UString& s1, const char *s2); |
439 | KJS_EXPORT inline bool operator!=(const UString& s1, const char *s2) { |
440 | return !KJS::operator==(s1, s2); |
441 | } |
442 | KJS_EXPORT inline bool operator==(const char *s1, const UString& s2) { |
443 | return operator==(s2, s1); |
444 | } |
445 | KJS_EXPORT inline bool operator!=(const char *s1, const UString& s2) { |
446 | return !KJS::operator==(s1, s2); |
447 | } |
448 | KJS_EXPORT bool operator==(const CString& s1, const CString& s2); |
449 | KJS_EXPORT inline UString operator+(const UString& s1, const UString& s2) { |
450 | return UString(s1, s2); |
451 | } |
452 | |
// Compares two strings; defined out of line.
// NOTE(review): the meaning of the int result is not visible in this header --
// presumably strcmp()-like (negative / zero / positive); confirm against the
// implementation.
KJS_EXPORT int compare(const UString &, const UString &);

// Given a first byte, gives the length of the UTF-8 sequence it begins.
// Returns 0 for bytes that are not legal starts of UTF-8 sequences.
// Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
int UTF8SequenceLength(char);

// Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
// Only allows Unicode characters (U-00000000 to U-0010FFFF).
// Returns -1 if the sequence is not valid (including presence of extra bytes).
int decodeUTF8Sequence(const char *);
464 | |
465 | KJS_EXPORT inline UString::UString() |
466 | : m_rep(&Rep::null) |
467 | { |
468 | } |
469 | |
470 | // Rule from ECMA 15.2 about what an array index is. |
471 | // Must exactly match string form of an unsigned integer, and be less than 2^32 - 1. |
472 | inline unsigned UString::toArrayIndex(bool *ok) const |
473 | { |
474 | unsigned i = toStrictUInt32(ok); |
475 | if (ok && i >= 0xFFFFFFFFU) |
476 | *ok = false; |
477 | return i; |
478 | } |
479 | |
480 | // We'd rather not do shared substring append for small strings, since |
481 | // this runs too much risk of a tiny initial string holding down a |
482 | // huge buffer. |
483 | // FIXME: this should be size_t but that would cause warnings until we |
484 | // fix UString sizes to be size_t instead of int |
485 | static const int minShareSize = Collector::minExtraCostSize / sizeof(UChar); |
486 | |
487 | inline size_t UString::cost() const |
488 | { |
489 | size_t capacity = (m_rep->baseString->capacity + m_rep->baseString->preCapacity) * sizeof(UChar); |
490 | size_t reportedCost = m_rep->baseString->reportedCost; |
491 | ASSERT(capacity >= reportedCost); |
492 | |
493 | size_t capacityDelta = capacity - reportedCost; |
494 | |
495 | if (capacityDelta < static_cast<size_t>(minShareSize)) |
496 | return 0; |
497 | |
498 | m_rep->baseString->reportedCost = capacity; |
499 | return capacityDelta; |
500 | } |
501 | |
502 | } // namespace |
503 | |
namespace WTF {

  // Forward declarations of the primary templates specialized below.
  template<typename T> struct DefaultHash;
  template<typename T> struct StrHash;

  // Hash functions for UString::Rep pointers. Hashing delegates to
  // Rep::hash() and equality to UString::equal(), rather than using the
  // pointer value itself.
  template<> struct StrHash<KJS::UString::Rep *> {
    static unsigned hash(const KJS::UString::Rep *key) { return key->hash(); }
    static bool equal(const KJS::UString::Rep *a, const KJS::UString::Rep *b) { return KJS::UString::equal(a, b); }
    // Both functions above dereference or forward their arguments.
    // NOTE(review): presumably this flag tells the hash table not to pass its
    // empty/deleted marker values to them -- confirm against the WTF hash
    // table documentation.
    static const bool safeToCompareToEmptyOrDeleted = false;
  };

  // Makes StrHash the default hash for UString::Rep pointers.
  template<> struct DefaultHash<KJS::UString::Rep *> {
    typedef StrHash<KJS::UString::Rep *> Hash;
  };
} // namespace WTF
519 | |
520 | |
521 | #endif |
522 | |