StringRef.cpp source code [llvm/lib/Support/StringRef.cpp]

1	//===-- StringRef.cpp - Lightweight String References ---------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "llvm/ADT/StringRef.h"
10	#include "llvm/ADT/APFloat.h"
11	#include "llvm/ADT/APInt.h"
12	#include "llvm/ADT/Hashing.h"
13	#include "llvm/ADT/StringExtras.h"
14	#include "llvm/ADT/edit_distance.h"
15	#include "llvm/Support/Error.h"
16	#include <bitset>
17
18	using namespace llvm;
19
20	// MSVC emits references to this into the translation units which reference it.
21	#ifndef _MSC_VER
22	constexpr size_t StringRef::npos;
23	#endif
24
25	// strncasecmp() is not available on non-POSIX systems, so define an
26	// alternative function here.
27	static int ascii_strncasecmp(const char LHS, const* char *RHS, size_t Length) {
28	for (size_t I = `0`; I < Length; ++I) {
29	unsigned char LHC = toLower(x: LHS[I]);
30	unsigned char RHC = toLower(x: RHS[I]);
31	if (LHC != RHC)
32	return LHC < RHC ? -`1` : `1`;
33	}
34	return `0`;
35	}
36
37	int StringRef::compare_insensitive(StringRef RHS) const {
38	if (int Res = ascii_strncasecmp(LHS: Data, RHS: RHS.Data, Length: std::min(a: Length, b: RHS.Length)))
39	return Res;
40	if (Length == RHS.Length)
41	return `0`;
42	return Length < RHS.Length ? -`1` : `1`;
43	}
44
45	bool StringRef::starts_with_insensitive(StringRef Prefix) const {
46	return Length >= Prefix.Length &&
47	ascii_strncasecmp(LHS: Data, RHS: Prefix.Data, Length: Prefix.Length) == `0`;
48	}
49
50	bool StringRef::ends_with_insensitive(StringRef Suffix) const {
51	return Length >= Suffix.Length &&
52	ascii_strncasecmp(LHS: end() - Suffix.Length, RHS: Suffix.Data, Length: Suffix.Length) == `0`;
53	}
54
55	size_t StringRef::find_insensitive(char C, size_t From) const {
56	char L = toLower(x: C);
57	return find_if(F: [L](char D) { return toLower(x: D) == L; }, From);
58	}
59
60	/// compare_numeric - Compare strings, handle embedded numbers.
61	int StringRef::compare_numeric(StringRef RHS) const {
62	for (size_t I = `0`, E = std::min(a: Length, b: RHS.Length); I != E; ++I) {
63	// Check for sequences of digits.
64	if (isDigit(C: Data[I]) && isDigit(C: RHS.Data[I])) {
65	// The longer sequence of numbers is considered larger.
66	// This doesn't really handle prefixed zeros well.
67	size_t J;
68	for (J = I + `1`; J != E + `1`; ++J) {
69	bool ld = J < Length && isDigit(C: Data[J]);
70	bool rd = J < RHS.Length && isDigit(C: RHS.Data[J]);
71	if (ld != rd)
72	return rd ? -`1` : `1`;
73	if (!rd)
74	break;
75	}
76	// The two number sequences have the same length (J-I), just memcmp them.
77	if (int Res = compareMemory(Lhs: Data + I, Rhs: RHS.Data + I, Length: J - I))
78	return Res < `0` ? -`1` : `1`;
79	// Identical number sequences, continue search after the numbers.
80	I = J - `1`;
81	continue;
82	}
83	if (Data[I] != RHS.Data[I])
84	return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -`1` : `1`;
85	}
86	if (Length == RHS.Length)
87	return `0`;
88	return Length < RHS.Length ? -`1` : `1`;
89	}
90
91	// Compute the edit distance between the two given strings.
92	unsigned StringRef::edit_distance(llvm::StringRef Other,
93	bool AllowReplacements,
94	unsigned MaxEditDistance) const {
95	return llvm::ComputeEditDistance(FromArray: ArrayRef(data(), size()),
96	ToArray: ArrayRef(Other.data(), Other.size()),
97	AllowReplacements, MaxEditDistance);
98	}
99
100	unsigned llvm::StringRef::edit_distance_insensitive(
101	StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const {
102	return llvm::ComputeMappedEditDistance(
103	FromArray: ArrayRef(data(), size()), ToArray: ArrayRef(Other.data(), Other.size()),
104	Map: llvm::toLower, AllowReplacements, MaxEditDistance);
105	}
106
107	//===----------------------------------------------------------------------===//
108	// String Operations
109	//===----------------------------------------------------------------------===//
110
111	std::string StringRef::lower() const {
112	return std::string (map_iterator(I: begin(), F: toLower),
113	map_iterator(I: end(), F: toLower));
114	}
115
116	std::string StringRef::upper() const {
117	return std::string (map_iterator(I: begin(), F: toUpper),
118	map_iterator(I: end(), F: toUpper));
119	}
120
121	//===----------------------------------------------------------------------===//
122	// String Searching
123	//===----------------------------------------------------------------------===//
124
125
126	/// find - Search for the first string \arg Str in the string.
127	///
128	/// \return - The index of the first occurrence of \arg Str, or npos if not
129	/// found.
130	size_t StringRef::find(StringRef Str, size_t From) const {
131	if (From > Length)
132	return npos;
133
134	const char *Start = Data + From;
135	size_t Size = Length - From;
136
137	const char *Needle = Str.data();
138	size_t N = Str.size();
139	if (N == `0`)
140	return From;
141	if (Size < N)
142	return npos;
143	if (N == `1`) {
144	const char Ptr = (const* char *)::memchr(s: Start, c: Needle[`0`], n: Size);
145	return Ptr == nullptr ? npos : Ptr - Data;
146	}
147
148	const char *Stop = Start + (Size - N + `1`);
149
150	if (N == `2`) {
151	// Provide a fast path for newline finding (CRLF case) in InclusionRewriter.
152	// Not the most optimized strategy, but getting memcmp inlined should be
153	// good enough.
154	do {
155	if (std::memcmp(s1: Start, s2: Needle, n: `2`) == `0`)
156	return Start - Data;
157	++Start;
158	} while (Start < Stop);
159	return npos;
160	}
161
162	// For short haystacks or unsupported needles fall back to the naive algorithm
163	if (Size < `16` \|\| N > `255`) {
164	do {
165	if (std::memcmp(s1: Start, s2: Needle, n: N) == `0`)
166	return Start - Data;
167	++Start;
168	} while (Start < Stop);
169	return npos;
170	}
171
172	// Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
173	uint8_t BadCharSkip[`256`];
174	std::memset(s: BadCharSkip, c: N, n: `256`);
175	for (unsigned i = `0`; i != N-`1`; ++i)
176	BadCharSkip[(uint8_t)Str [i]] = N-`1`-i;
177
178	do {
179	uint8_t Last = Start[N - `1`];
180	if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - `1`]))
181	if (std::memcmp(s1: Start, s2: Needle, n: N - `1`) == `0`)
182	return Start - Data;
183
184	// Otherwise skip the appropriate number of bytes.
185	Start += BadCharSkip[Last];
186	} while (Start < Stop);
187
188	return npos;
189	}
190
191	size_t StringRef::find_insensitive(StringRef Str, size_t From) const {
192	StringRef This = substr(Start: From);
193	while (This.size() >= Str.size()) {
194	if (This.starts_with_insensitive(Prefix: Str))
195	return From;
196	This = This.drop_front();
197	++From;
198	}
199	return npos;
200	}
201
202	size_t StringRef::rfind_insensitive(char C, size_t From) const {
203	From = std::min(a: From, b: Length);
204	size_t i = From;
205	while (i != `0`) {
206	--i;
207	if (toLower(x: Data[i]) == toLower(x: C))
208	return i;
209	}
210	return npos;
211	}
212
213	/// rfind - Search for the last string \arg Str in the string.
214	///
215	/// \return - The index of the last occurrence of \arg Str, or npos if not
216	/// found.
217	size_t StringRef::rfind(StringRef Str) const {
218	return std::string_view(*this).rfind(str: Str);
219	}
220
221	size_t StringRef::rfind_insensitive(StringRef Str) const {
222	size_t N = Str.size();
223	if (N > Length)
224	return npos;
225	for (size_t i = Length - N + `1`, e = `0`; i != e;) {
226	--i;
227	if (substr(Start: i, N).equals_insensitive(RHS: Str))
228	return i;
229	}
230	return npos;
231	}
232
233	/// find_first_of - Find the first character in the string that is in \arg
234	/// Chars, or npos if not found.
235	///
236	/// Note: O(size() + Chars.size())
237	StringRef::size_type StringRef::find_first_of(StringRef Chars,
238	size_t From) const {
239	std::bitset<`1` << CHAR_BIT> CharBits;
240	for (char C : Chars)
241	CharBits.set(position: (unsigned char)C);
242
243	for (size_type i = std::min(a: From, b: Length), e = Length; i != e; ++i)
244	if (CharBits.test(position: (unsigned char)Data[i]))
245	return i;
246	return npos;
247	}
248
249	/// find_first_not_of - Find the first character in the string that is not
250	/// \arg C or npos if not found.
251	StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
252	return std::string_view(*this).find_first_not_of(c: C, pos: From);
253	}
254
255	/// find_first_not_of - Find the first character in the string that is not
256	/// in the string \arg Chars, or npos if not found.
257	///
258	/// Note: O(size() + Chars.size())
259	StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
260	size_t From) const {
261	std::bitset<`1` << CHAR_BIT> CharBits;
262	for (char C : Chars)
263	CharBits.set(position: (unsigned char)C);
264
265	for (size_type i = std::min(a: From, b: Length), e = Length; i != e; ++i)
266	if (!CharBits.test(position: (unsigned char)Data[i]))
267	return i;
268	return npos;
269	}
270
271	/// find_last_of - Find the last character in the string that is in \arg C,
272	/// or npos if not found.
273	///
274	/// Note: O(size() + Chars.size())
275	StringRef::size_type StringRef::find_last_of(StringRef Chars,
276	size_t From) const {
277	std::bitset<`1` << CHAR_BIT> CharBits;
278	for (char C : Chars)
279	CharBits.set(position: (unsigned char)C);
280
281	for (size_type i = std::min(a: From, b: Length) - `1`, e = -`1`; i != e; --i)
282	if (CharBits.test(position: (unsigned char)Data[i]))
283	return i;
284	return npos;
285	}
286
287	/// find_last_not_of - Find the last character in the string that is not
288	/// \arg C, or npos if not found.
289	StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
290	for (size_type i = std::min(a: From, b: Length) - `1`, e = -`1`; i != e; --i)
291	if (Data[i] != C)
292	return i;
293	return npos;
294	}
295
296	/// find_last_not_of - Find the last character in the string that is not in
297	/// \arg Chars, or npos if not found.
298	///
299	/// Note: O(size() + Chars.size())
300	StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
301	size_t From) const {
302	std::bitset<`1` << CHAR_BIT> CharBits;
303	for (char C : Chars)
304	CharBits.set(position: (unsigned char)C);
305
306	for (size_type i = std::min(a: From, b: Length) - `1`, e = -`1`; i != e; --i)
307	if (!CharBits.test(position: (unsigned char)Data[i]))
308	return i;
309	return npos;
310	}
311
312	void StringRef::split(SmallVectorImpl<StringRef> &A,
313	StringRef Separator, int MaxSplit,
314	bool KeepEmpty) const {
315	StringRef S = *this;
316
317	// Count down from MaxSplit. When MaxSplit is -1, this will just split
318	// "forever". This doesn't support splitting more than 2^31 times
319	// intentionally; if we ever want that we can make MaxSplit a 64-bit integer
320	// but that seems unlikely to be useful.
321	while (MaxSplit-- != `0`) {
322	size_t Idx = S.find(Str: Separator);
323	if (Idx == npos)
324	break;
325
326	// Push this split.
327	if (KeepEmpty \|\| Idx > `0`)
328	A.push_back(Elt: S.slice(Start: `0`, End: Idx));
329
330	// Jump forward.
331	S = S.slice(Start: Idx + Separator.size(), End: npos);
332	}
333
334	// Push the tail.
335	if (KeepEmpty \|\| !S.empty())
336	A.push_back(Elt: S);
337	}
338
339	void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
340	int MaxSplit, bool KeepEmpty) const {
341	StringRef S = *this;
342
343	// Count down from MaxSplit. When MaxSplit is -1, this will just split
344	// "forever". This doesn't support splitting more than 2^31 times
345	// intentionally; if we ever want that we can make MaxSplit a 64-bit integer
346	// but that seems unlikely to be useful.
347	while (MaxSplit-- != `0`) {
348	size_t Idx = S.find(C: Separator);
349	if (Idx == npos)
350	break;
351
352	// Push this split.
353	if (KeepEmpty \|\| Idx > `0`)
354	A.push_back(Elt: S.slice(Start: `0`, End: Idx));
355
356	// Jump forward.
357	S = S.slice(Start: Idx + `1`, End: npos);
358	}
359
360	// Push the tail.
361	if (KeepEmpty \|\| !S.empty())
362	A.push_back(Elt: S);
363	}
364
365	//===----------------------------------------------------------------------===//
366	// Helpful Algorithms
367	//===----------------------------------------------------------------------===//
368
369	/// count - Return the number of non-overlapped occurrences of \arg Str in
370	/// the string.
371	size_t StringRef::count(StringRef Str) const {
372	size_t Count = `0`;
373	size_t Pos = `0`;
374	size_t N = Str.size();
375	// TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing
376	// this to `Length + 1` which is more in-line with the function
377	// description.
378	if (!N)
379	return `0`;
380	while ((Pos = find(Str, From: Pos)) != npos) {
381	++Count;
382	Pos += N;
383	}
384	return Count;
385	}
386
387	static unsigned GetAutoSenseRadix(StringRef &Str) {
388	if (Str.empty())
389	return `10`;
390
391	if (Str.consume_front_insensitive(Prefix: "0x"))
392	return `16`;
393
394	if (Str.consume_front_insensitive(Prefix: "0b"))
395	return `2`;
396
397	if (Str.consume_front(Prefix: "0o"))
398	return `8`;
399
400	if (Str [`0`] == `'0'` && Str.size() > `1` && isDigit(C: Str [`1`])) {
401	Str = Str.substr(Start: `1`);
402	return `8`;
403	}
404
405	return `10`;
406	}
407
408	bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
409	unsigned long long &Result) {
410	// Autosense radix if not specified.
411	if (Radix == `0`)
412	Radix = GetAutoSenseRadix(Str);
413
414	// Empty strings (after the radix autosense) are invalid.
415	if (Str.empty()) return true;
416
417	// Parse all the bytes of the string given this radix. Watch for overflow.
418	StringRef Str2 = Str;
419	Result = `0`;
420	while (!Str2.empty()) {
421	unsigned CharVal;
422	if (Str2 [`0`] >= `'0'` && Str2 [`0`] <= `'9'`)
423	CharVal = Str2 [`0`] - `'0'`;
424	else if (Str2 [`0`] >= `'a'` && Str2 [`0`] <= `'z'`)
425	CharVal = Str2 [`0`] - `'a'` + `10`;
426	else if (Str2 [`0`] >= `'A'` && Str2 [`0`] <= `'Z'`)
427	CharVal = Str2 [`0`] - `'A'` + `10`;
428	else
429	break;
430
431	// If the parsed value is larger than the integer radix, we cannot
432	// consume any more characters.
433	if (CharVal >= Radix)
434	break;
435
436	// Add in this character.
437	unsigned long long PrevResult = Result;
438	Result = Result * Radix + CharVal;
439
440	// Check for overflow by shifting back and seeing if bits were lost.
441	if (Result / Radix < PrevResult)
442	return true;
443
444	Str2 = Str2.substr(Start: `1`);
445	}
446
447	// We consider the operation a failure if no characters were consumed
448	// successfully.
449	if (Str.size() == Str2.size())
450	return true;
451
452	Str = Str2;
453	return false;
454	}
455
456	bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
457	long long &Result) {
458	unsigned long long ULLVal;
459
460	// Handle positive strings first.
461	if (!Str.starts_with(Prefix: "-")) {
462	if (consumeUnsignedInteger(Str, Radix, Result&: ULLVal) \|\|
463	// Check for value so large it overflows a signed value.
464	(long long)ULLVal < `0`)
465	return true;
466	Result = ULLVal;
467	return false;
468	}
469
470	// Get the positive part of the value.
471	StringRef Str2 = Str.drop_front(N: `1`);
472	if (consumeUnsignedInteger(Str&: Str2, Radix, Result&: ULLVal) \|\|
473	// Reject values so large they'd overflow as negative signed, but allow
474	// "-0". This negates the unsigned so that the negative isn't undefined
475	// on signed overflow.
476	(long long)-ULLVal > `0`)
477	return true;
478
479	Str = Str2;
480	Result = -ULLVal;
481	return false;
482	}
483
484	/// GetAsUnsignedInteger - Workhorse method that converts a integer character
485	/// sequence of radix up to 36 to an unsigned long long value.
486	bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
487	unsigned long long &Result) {
488	if (consumeUnsignedInteger(Str, Radix, Result))
489	return true;
490
491	// For getAsUnsignedInteger, we require the whole string to be consumed or
492	// else we consider it a failure.
493	return !Str.empty();
494	}
495
496	bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
497	long long &Result) {
498	if (consumeSignedInteger(Str, Radix, Result))
499	return true;
500
501	// For getAsSignedInteger, we require the whole string to be consumed or else
502	// we consider it a failure.
503	return !Str.empty();
504	}
505
506	bool StringRef::consumeInteger(unsigned Radix, APInt &Result) {
507	StringRef Str = *this;
508
509	// Autosense radix if not specified.
510	if (Radix == `0`)
511	Radix = GetAutoSenseRadix(Str);
512
513	assert(Radix > `1` && Radix <= `36`);
514
515	// Empty strings (after the radix autosense) are invalid.
516	if (Str.empty()) return true;
517
518	// Skip leading zeroes. This can be a significant improvement if
519	// it means we don't need > 64 bits.
520	Str = Str.ltrim(Char: `'0'`);
521
522	// If it was nothing but zeroes....
523	if (Str.empty()) {
524	Result = APInt (`64`, `0`);
525	*this = Str;
526	return false;
527	}
528
529	// (Over-)estimate the required number of bits.
530	unsigned Log2Radix = `0`;
531	while ((`1U` << Log2Radix) < Radix) Log2Radix++;
532	bool IsPowerOf2Radix = ((`1U` << Log2Radix) == Radix);
533
534	unsigned BitWidth = Log2Radix * Str.size();
535	if (BitWidth < Result.getBitWidth())
536	BitWidth = Result.getBitWidth(); // don't shrink the result
537	else if (BitWidth > Result.getBitWidth())
538	Result = Result.zext(width: BitWidth);
539
540	APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
541	if (!IsPowerOf2Radix) {
542	// These must have the same bit-width as Result.
543	RadixAP = APInt (BitWidth, Radix);
544	CharAP = APInt (BitWidth, `0`);
545	}
546
547	// Parse all the bytes of the string given this radix.
548	Result = `0`;
549	while (!Str.empty()) {
550	unsigned CharVal;
551	if (Str [`0`] >= `'0'` && Str [`0`] <= `'9'`)
552	CharVal = Str [`0`]-`'0'`;
553	else if (Str [`0`] >= `'a'` && Str [`0`] <= `'z'`)
554	CharVal = Str [`0`]-`'a'`+`10`;
555	else if (Str [`0`] >= `'A'` && Str [`0`] <= `'Z'`)
556	CharVal = Str [`0`]-`'A'`+`10`;
557	else
558	break;
559
560	// If the parsed value is larger than the integer radix, the string is
561	// invalid.
562	if (CharVal >= Radix)
563	break;
564
565	// Add in this character.
566	if (IsPowerOf2Radix) {
567	Result <<= Log2Radix;
568	Result \|= CharVal;
569	} else {
570	Result *= RadixAP;
571	CharAP = CharVal;
572	Result += CharAP;
573	}
574
575	Str = Str.substr(Start: `1`);
576	}
577
578	// We consider the operation a failure if no characters were consumed
579	// successfully.
580	if (size() == Str.size())
581	return true;
582
583	*this = Str;
584	return false;
585	}
586
587	bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
588	StringRef Str = *this;
589	if (Str.consumeInteger(Radix, Result))
590	return true;
591
592	// For getAsInteger, we require the whole string to be consumed or else we
593	// consider it a failure.
594	return !Str.empty();
595	}
596
597	bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
598	APFloat F(`0.0`);
599	auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven);
600	if (errorToBool(Err: StatusOrErr.takeError()))
601	return true;
602
603	APFloat::opStatus Status = *StatusOrErr;
604	if (Status != APFloat::opOK) {
605	if (!AllowInexact \|\| !(Status & APFloat::opInexact))
606	return true;
607	}
608
609	Result = F.convertToDouble();
610	return false;
611	}
612
613	// Implementation of StringRef hashing.
614	hash_code llvm::hash_value(StringRef S) {
615	return hash_combine_range(first: S.begin(), last: S.end());
616	}
617
618	unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) {
619	assert(Val.data() != getEmptyKey().data() &&
620	"Cannot hash the empty key!");
621	assert(Val.data() != getTombstoneKey().data() &&
622	"Cannot hash the tombstone key!");
623	return (unsigned)(hash_value(S: Val));
624	}
625

source code of llvm/lib/Support/StringRef.cpp