1/*
2 * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#pragma once
27
28#include "ConcurrentJSLock.h"
29#include "YarrPattern.h"
30
31namespace WTF {
32class BumpPointerAllocator;
33}
34using WTF::BumpPointerAllocator;
35
36namespace JSC { namespace Yarr {
37
38class ByteDisjunction;
39
40struct ByteTerm {
41 enum Type {
42 TypeBodyAlternativeBegin,
43 TypeBodyAlternativeDisjunction,
44 TypeBodyAlternativeEnd,
45 TypeAlternativeBegin,
46 TypeAlternativeDisjunction,
47 TypeAlternativeEnd,
48 TypeSubpatternBegin,
49 TypeSubpatternEnd,
50 TypeAssertionBOL,
51 TypeAssertionEOL,
52 TypeAssertionWordBoundary,
53 TypePatternCharacterOnce,
54 TypePatternCharacterFixed,
55 TypePatternCharacterGreedy,
56 TypePatternCharacterNonGreedy,
57 TypePatternCasedCharacterOnce,
58 TypePatternCasedCharacterFixed,
59 TypePatternCasedCharacterGreedy,
60 TypePatternCasedCharacterNonGreedy,
61 TypeCharacterClass,
62 TypeBackReference,
63 TypeParenthesesSubpattern,
64 TypeParenthesesSubpatternOnceBegin,
65 TypeParenthesesSubpatternOnceEnd,
66 TypeParenthesesSubpatternTerminalBegin,
67 TypeParenthesesSubpatternTerminalEnd,
68 TypeParentheticalAssertionBegin,
69 TypeParentheticalAssertionEnd,
70 TypeCheckInput,
71 TypeUncheckInput,
72 TypeDotStarEnclosure,
73 } type;
74 union {
75 struct {
76 union {
77 UChar32 patternCharacter;
78 struct {
79 UChar32 lo;
80 UChar32 hi;
81 } casedCharacter;
82 CharacterClass* characterClass;
83 unsigned subpatternId;
84 };
85 union {
86 ByteDisjunction* parenthesesDisjunction;
87 unsigned parenthesesWidth;
88 };
89 QuantifierType quantityType;
90 unsigned quantityMinCount;
91 unsigned quantityMaxCount;
92 } atom;
93 struct {
94 int next;
95 int end;
96 bool onceThrough;
97 } alternative;
98 struct {
99 bool m_bol : 1;
100 bool m_eol : 1;
101 } anchors;
102 unsigned checkInputCount;
103 };
104 unsigned frameLocation;
105 bool m_capture : 1;
106 bool m_invert : 1;
107 unsigned inputPosition;
108
109 ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
110 : frameLocation(frameLocation)
111 , m_capture(false)
112 , m_invert(false)
113 {
114 atom.patternCharacter = ch;
115 atom.quantityType = quantityType;
116 atom.quantityMinCount = quantityCount.unsafeGet();
117 atom.quantityMaxCount = quantityCount.unsafeGet();
118 inputPosition = inputPos;
119
120 switch (quantityType) {
121 case QuantifierFixedCount:
122 type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
123 break;
124 case QuantifierGreedy:
125 type = ByteTerm::TypePatternCharacterGreedy;
126 break;
127 case QuantifierNonGreedy:
128 type = ByteTerm::TypePatternCharacterNonGreedy;
129 break;
130 }
131 }
132
133 ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
134 : frameLocation(frameLocation)
135 , m_capture(false)
136 , m_invert(false)
137 {
138 switch (quantityType) {
139 case QuantifierFixedCount:
140 type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
141 break;
142 case QuantifierGreedy:
143 type = ByteTerm::TypePatternCasedCharacterGreedy;
144 break;
145 case QuantifierNonGreedy:
146 type = ByteTerm::TypePatternCasedCharacterNonGreedy;
147 break;
148 }
149
150 atom.casedCharacter.lo = lo;
151 atom.casedCharacter.hi = hi;
152 atom.quantityType = quantityType;
153 atom.quantityMinCount = quantityCount.unsafeGet();
154 atom.quantityMaxCount = quantityCount.unsafeGet();
155 inputPosition = inputPos;
156 }
157
158 ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos)
159 : type(ByteTerm::TypeCharacterClass)
160 , m_capture(false)
161 , m_invert(invert)
162 {
163 atom.characterClass = characterClass;
164 atom.quantityType = QuantifierFixedCount;
165 atom.quantityMinCount = 1;
166 atom.quantityMaxCount = 1;
167 inputPosition = inputPos;
168 }
169
170 ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos)
171 : type(type)
172 , m_capture(capture)
173 , m_invert(false)
174 {
175 atom.subpatternId = subpatternId;
176 atom.parenthesesDisjunction = parenthesesInfo;
177 atom.quantityType = QuantifierFixedCount;
178 atom.quantityMinCount = 1;
179 atom.quantityMaxCount = 1;
180 inputPosition = inputPos;
181 }
182
183 ByteTerm(Type type, bool invert = false)
184 : type(type)
185 , m_capture(false)
186 , m_invert(invert)
187 {
188 atom.quantityType = QuantifierFixedCount;
189 atom.quantityMinCount = 1;
190 atom.quantityMaxCount = 1;
191 }
192
193 ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos)
194 : type(type)
195 , m_capture(capture)
196 , m_invert(invert)
197 {
198 atom.subpatternId = subpatternId;
199 atom.quantityType = QuantifierFixedCount;
200 atom.quantityMinCount = 1;
201 atom.quantityMaxCount = 1;
202 inputPosition = inputPos;
203 }
204
205 static ByteTerm BOL(unsigned inputPos)
206 {
207 ByteTerm term(TypeAssertionBOL);
208 term.inputPosition = inputPos;
209 return term;
210 }
211
212 static ByteTerm CheckInput(Checked<unsigned> count)
213 {
214 ByteTerm term(TypeCheckInput);
215 term.checkInputCount = count.unsafeGet();
216 return term;
217 }
218
219 static ByteTerm UncheckInput(Checked<unsigned> count)
220 {
221 ByteTerm term(TypeUncheckInput);
222 term.checkInputCount = count.unsafeGet();
223 return term;
224 }
225
226 static ByteTerm EOL(unsigned inputPos)
227 {
228 ByteTerm term(TypeAssertionEOL);
229 term.inputPosition = inputPos;
230 return term;
231 }
232
233 static ByteTerm WordBoundary(bool invert, unsigned inputPos)
234 {
235 ByteTerm term(TypeAssertionWordBoundary, invert);
236 term.inputPosition = inputPos;
237 return term;
238 }
239
240 static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos)
241 {
242 return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
243 }
244
245 static ByteTerm BodyAlternativeBegin(bool onceThrough)
246 {
247 ByteTerm term(TypeBodyAlternativeBegin);
248 term.alternative.next = 0;
249 term.alternative.end = 0;
250 term.alternative.onceThrough = onceThrough;
251 return term;
252 }
253
254 static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
255 {
256 ByteTerm term(TypeBodyAlternativeDisjunction);
257 term.alternative.next = 0;
258 term.alternative.end = 0;
259 term.alternative.onceThrough = onceThrough;
260 return term;
261 }
262
263 static ByteTerm BodyAlternativeEnd()
264 {
265 ByteTerm term(TypeBodyAlternativeEnd);
266 term.alternative.next = 0;
267 term.alternative.end = 0;
268 term.alternative.onceThrough = false;
269 return term;
270 }
271
272 static ByteTerm AlternativeBegin()
273 {
274 ByteTerm term(TypeAlternativeBegin);
275 term.alternative.next = 0;
276 term.alternative.end = 0;
277 term.alternative.onceThrough = false;
278 return term;
279 }
280
281 static ByteTerm AlternativeDisjunction()
282 {
283 ByteTerm term(TypeAlternativeDisjunction);
284 term.alternative.next = 0;
285 term.alternative.end = 0;
286 term.alternative.onceThrough = false;
287 return term;
288 }
289
290 static ByteTerm AlternativeEnd()
291 {
292 ByteTerm term(TypeAlternativeEnd);
293 term.alternative.next = 0;
294 term.alternative.end = 0;
295 term.alternative.onceThrough = false;
296 return term;
297 }
298
299 static ByteTerm SubpatternBegin()
300 {
301 return ByteTerm(TypeSubpatternBegin);
302 }
303
304 static ByteTerm SubpatternEnd()
305 {
306 return ByteTerm(TypeSubpatternEnd);
307 }
308
309 static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
310 {
311 ByteTerm term(TypeDotStarEnclosure);
312 term.anchors.m_bol = bolAnchor;
313 term.anchors.m_eol = eolAnchor;
314 return term;
315 }
316
317 bool invert()
318 {
319 return m_invert;
320 }
321
322 bool capture()
323 {
324 return m_capture;
325 }
326};
327
328class ByteDisjunction {
329 WTF_MAKE_FAST_ALLOCATED;
330public:
331 ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
332 : m_numSubpatterns(numSubpatterns)
333 , m_frameSize(frameSize)
334 {
335 }
336
337 size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); }
338
339 Vector<ByteTerm> terms;
340 unsigned m_numSubpatterns;
341 unsigned m_frameSize;
342};
343
344struct BytecodePattern {
345 WTF_MAKE_FAST_ALLOCATED;
346public:
347 BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
348 : m_body(WTFMove(body))
349 , m_flags(pattern.m_flags)
350 , m_allocator(allocator)
351 , m_lock(lock)
352 {
353 m_body->terms.shrinkToFit();
354
355 newlineCharacterClass = pattern.newlineCharacterClass();
356 if (unicode() && ignoreCase())
357 wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
358 else
359 wordcharCharacterClass = pattern.wordcharCharacterClass();
360
361 m_allParenthesesInfo.swap(x&: parenthesesInfoToAdopt);
362 m_allParenthesesInfo.shrinkToFit();
363
364 m_userCharacterClasses.swap(x&: pattern.m_userCharacterClasses);
365 m_userCharacterClasses.shrinkToFit();
366 }
367
368 size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); }
369
370 bool ignoreCase() const { return m_flags & FlagIgnoreCase; }
371 bool multiline() const { return m_flags & FlagMultiline; }
372 bool sticky() const { return m_flags & FlagSticky; }
373 bool unicode() const { return m_flags & FlagUnicode; }
374 bool dotAll() const { return m_flags & FlagDotAll; }
375
376 std::unique_ptr<ByteDisjunction> m_body;
377 RegExpFlags m_flags;
378 // Each BytecodePattern is associated with a RegExp, each RegExp is associated
379 // with a VM. Cache a pointer to out VM's m_regExpAllocator.
380 BumpPointerAllocator* m_allocator;
381 ConcurrentJSLock* m_lock;
382
383 CharacterClass* newlineCharacterClass;
384 CharacterClass* wordcharCharacterClass;
385
386private:
387 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
388 Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses;
389};
390
391JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ConcurrentJSLock* = nullptr);
392JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output);
393unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output);
394unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output);
395
396} } // namespace JSC::Yarr
397

source code of qtdeclarative/src/3rdparty/masm/yarr/YarrInterpreter.h