1/****************************************************************************
2**
3** Copyright (C) 2016 Intel Corporation.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#include "qurl.h"
41#include "private/qutfcodec_p.h"
42#include "private/qtools_p.h"
43#include "private/qsimd_p.h"
44
45QT_BEGIN_NAMESPACE
46
47// ### move to qurl_p.h
// What recode() does with a character, as stored in the action tables.
// The numeric values are load-bearing: the tables in this file are written
// with the literals 0/1/2, and maskTable() clears entries to 0.
enum EncodingAction {
    DecodeCharacter = 0,    // write the character in its literal form
    LeaveCharacter,         // keep whichever form (literal or %XX) the input used
    EncodeCharacter         // write the character as %XX
};
static_assert(LeaveCharacter == 1 && EncodeCharacter == 2,
              "action tables are written with the literal values 0, 1 and 2");
53
// From RFC 3986, Appendix A: Collected ABNF for URI
55// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
56// reserved = gen-delims / sub-delims
57// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
58// sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
59// / "*" / "+" / "," / ";" / "="
60static const uchar defaultActionTable[96] = {
61 2, // space
62 1, // '!' (sub-delim)
63 2, // '"'
64 1, // '#' (gen-delim)
65 1, // '$' (gen-delim)
66 2, // '%' (percent)
67 1, // '&' (gen-delim)
68 1, // "'" (sub-delim)
69 1, // '(' (sub-delim)
70 1, // ')' (sub-delim)
71 1, // '*' (sub-delim)
72 1, // '+' (sub-delim)
73 1, // ',' (sub-delim)
74 0, // '-' (unreserved)
75 0, // '.' (unreserved)
76 1, // '/' (gen-delim)
77
78 0, 0, 0, 0, 0, // '0' to '4' (unreserved)
79 0, 0, 0, 0, 0, // '5' to '9' (unreserved)
80 1, // ':' (gen-delim)
81 1, // ';' (sub-delim)
82 2, // '<'
83 1, // '=' (sub-delim)
84 2, // '>'
85 1, // '?' (gen-delim)
86
87 1, // '@' (gen-delim)
88 0, 0, 0, 0, 0, // 'A' to 'E' (unreserved)
89 0, 0, 0, 0, 0, // 'F' to 'J' (unreserved)
90 0, 0, 0, 0, 0, // 'K' to 'O' (unreserved)
91 0, 0, 0, 0, 0, // 'P' to 'T' (unreserved)
92 0, 0, 0, 0, 0, 0, // 'U' to 'Z' (unreserved)
93 1, // '[' (gen-delim)
94 2, // '\'
95 1, // ']' (gen-delim)
96 2, // '^'
97 0, // '_' (unreserved)
98
99 2, // '`'
100 0, 0, 0, 0, 0, // 'a' to 'e' (unreserved)
101 0, 0, 0, 0, 0, // 'f' to 'j' (unreserved)
102 0, 0, 0, 0, 0, // 'k' to 'o' (unreserved)
103 0, 0, 0, 0, 0, // 'p' to 't' (unreserved)
104 0, 0, 0, 0, 0, 0, // 'u' to 'z' (unreserved)
105 2, // '{'
106 2, // '|'
107 2, // '}'
108 0, // '~' (unreserved)
109
110 2 // BSKP
111};
112
113// mask tables, in negative polarity
114// 0x00 if it belongs to this category
115// 0xff if it doesn't
116
117static const uchar reservedMask[96] = {
118 0xff, // space
119 0xff, // '!' (sub-delim)
120 0x00, // '"'
121 0xff, // '#' (gen-delim)
122 0xff, // '$' (gen-delim)
123 0xff, // '%' (percent)
124 0xff, // '&' (gen-delim)
125 0xff, // "'" (sub-delim)
126 0xff, // '(' (sub-delim)
127 0xff, // ')' (sub-delim)
128 0xff, // '*' (sub-delim)
129 0xff, // '+' (sub-delim)
130 0xff, // ',' (sub-delim)
131 0xff, // '-' (unreserved)
132 0xff, // '.' (unreserved)
133 0xff, // '/' (gen-delim)
134
135 0xff, 0xff, 0xff, 0xff, 0xff, // '0' to '4' (unreserved)
136 0xff, 0xff, 0xff, 0xff, 0xff, // '5' to '9' (unreserved)
137 0xff, // ':' (gen-delim)
138 0xff, // ';' (sub-delim)
139 0x00, // '<'
140 0xff, // '=' (sub-delim)
141 0x00, // '>'
142 0xff, // '?' (gen-delim)
143
144 0xff, // '@' (gen-delim)
145 0xff, 0xff, 0xff, 0xff, 0xff, // 'A' to 'E' (unreserved)
146 0xff, 0xff, 0xff, 0xff, 0xff, // 'F' to 'J' (unreserved)
147 0xff, 0xff, 0xff, 0xff, 0xff, // 'K' to 'O' (unreserved)
148 0xff, 0xff, 0xff, 0xff, 0xff, // 'P' to 'T' (unreserved)
149 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'U' to 'Z' (unreserved)
150 0xff, // '[' (gen-delim)
151 0x00, // '\'
152 0xff, // ']' (gen-delim)
153 0x00, // '^'
154 0xff, // '_' (unreserved)
155
156 0x00, // '`'
157 0xff, 0xff, 0xff, 0xff, 0xff, // 'a' to 'e' (unreserved)
158 0xff, 0xff, 0xff, 0xff, 0xff, // 'f' to 'j' (unreserved)
159 0xff, 0xff, 0xff, 0xff, 0xff, // 'k' to 'o' (unreserved)
160 0xff, 0xff, 0xff, 0xff, 0xff, // 'p' to 't' (unreserved)
161 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 'u' to 'z' (unreserved)
162 0x00, // '{'
163 0x00, // '|'
164 0x00, // '}'
165 0xff, // '~' (unreserved)
166
167 0xff // BSKP
168};
169
// Returns true if \a c is an ASCII hexadecimal digit (0-9, A-F or a-f).
static inline bool isHex(ushort c)
{
    const bool isDigit = c >= '0' && c <= '9';
    const bool isUpperLetter = c >= 'A' && c <= 'F';
    const bool isLowerLetter = c >= 'a' && c <= 'f';
    return isLowerLetter || isUpperLetter || isDigit;
}
176
// Returns true if the hex digit \a c needs no uppercasing, that is, it is a
// decimal digit or one of A-F. The result is meaningless if \a c is not a hex
// digit: callers must have checked that with isHex() first.
static inline bool isUpperHex(ushort c)
{
    // every lowercase hex letter lives at or above 0x60; digits and A-F are below
    const ushort firstLowercaseColumn = 0x60;
    return c < firstLowercaseColumn;
}
182
183static inline ushort toUpperHex(ushort c)
184{
185 return isUpperHex(c) ? c : c - 0x20;
186}
187
// Returns the value (0-15) of the hex digit \a c. \a c must be a hex digit;
// no validation is done here.
static inline ushort decodeNibble(ushort c)
{
    // test the ranges from the highest code point down: a-f, then A-F, then 0-9
    if (c >= 'a')
        return c - 'a' + 0xA;
    if (c >= 'A')
        return c - 'A' + 0xA;
    return c - '0';
}
193
194// if the sequence at input is 2*HEXDIG, returns its decoding
195// returns -1 if it isn't.
196// assumes that the range has been checked already
197static inline ushort decodePercentEncoding(const ushort *input)
198{
199 ushort c1 = input[1];
200 ushort c2 = input[2];
201 if (!isHex(c: c1) || !isHex(c: c2))
202 return ushort(-1);
203 return decodeNibble(c: c1) << 4 | decodeNibble(c: c2);
204}
205
206static inline ushort encodeNibble(ushort c)
207{
208 return ushort(QtMiscUtils::toHexUpper(value: c));
209}
210
211static void ensureDetached(QString &result, ushort *&output, const ushort *begin, const ushort *input, const ushort *end,
212 int add = 0)
213{
214 if (!output) {
215 // now detach
216 // create enough space if the rest of the string needed to be percent-encoded
217 int charsProcessed = input - begin;
218 int charsRemaining = end - input;
219 int spaceNeeded = end - begin + 2 * charsRemaining + add;
220 int origSize = result.size();
221 result.resize(size: origSize + spaceNeeded);
222
223 // we know that resize() above detached, so we bypass the reference count check
224 output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()))
225 + origSize;
226
227 // copy the chars we've already processed
228 int i;
229 for (i = 0; i < charsProcessed; ++i)
230 output[i] = begin[i];
231 output += i;
232 }
233}
234
235namespace {
236struct QUrlUtf8Traits : public QUtf8BaseTraitsNoAscii
237{
238 // From RFC 3987:
239 // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
240 //
241 // ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
242 // / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
243 // / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
244 // / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
245 // / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
246 // / %xD0000-DFFFD / %xE1000-EFFFD
247 //
248 // iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
249 //
250 // That RFC allows iprivate only as part of iquery, but we don't know here
251 // whether we're looking at a query or another part of an URI, so we accept
252 // them too. The definition above excludes U+FFF0 to U+FFFD from appearing
253 // unencoded, but we see no reason for its exclusion, so we allow them to
254 // be decoded (and we need U+FFFD the replacement character to indicate
255 // failure to decode).
256 //
257 // That means we must disallow:
258 // * unpaired surrogates (QUtf8Functions takes care of that for us)
259 // * non-characters
260 static const bool allowNonCharacters = false;
261
262 // override: our "bytes" are three percent-encoded UTF-16 characters
263 static void appendByte(ushort *&ptr, uchar b)
264 {
265 // b >= 0x80, by construction, so percent-encode
266 *ptr++ = '%';
267 *ptr++ = encodeNibble(c: b >> 4);
268 *ptr++ = encodeNibble(c: b & 0xf);
269 }
270
271 static uchar peekByte(const ushort *ptr, int n = 0)
272 {
273 // decodePercentEncoding returns ushort(-1) if it can't decode,
274 // which means we return 0xff, which is not a valid continuation byte.
275 // If ptr[i * 3] is not '%', we'll multiply by zero and return 0,
276 // also not a valid continuation byte (if it's '%', we multiply by 1).
277 return uchar(decodePercentEncoding(input: ptr + n * 3))
278 * uchar(ptr[n * 3] == '%');
279 }
280
281 static qptrdiff availableBytes(const ushort *ptr, const ushort *end)
282 {
283 return (end - ptr) / 3;
284 }
285
286 static void advanceByte(const ushort *&ptr, int n = 1)
287 {
288 ptr += n * 3;
289 }
290};
291}
292
293// returns true if we performed an UTF-8 decoding
294static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input,
295 const ushort *end, ushort decoded)
296{
297 uint ucs4, *dst = &ucs4;
298 const ushort *src = input + 3;// skip the %XX that yielded \a decoded
299 int charsNeeded = QUtf8Functions::fromUtf8<QUrlUtf8Traits>(b: decoded, dst, src, end);
300 if (charsNeeded < 0)
301 return false;
302
303 if (!QChar::requiresSurrogates(ucs4)) {
304 // UTF-8 decoded and no surrogates are required
305 // detach if necessary
306 // possibilities are: 6 chars (%XX%XX) -> one char; 9 chars (%XX%XX%XX) -> one char
307 ensureDetached(result, output, begin, input, end, add: -3 * charsNeeded + 1);
308 *output++ = ucs4;
309 } else {
310 // UTF-8 decoded to something that requires a surrogate pair
311 // compressing from %XX%XX%XX%XX (12 chars) to two
312 ensureDetached(result, output, begin, input, end, add: -10);
313 *output++ = QChar::highSurrogate(ucs4);
314 *output++ = QChar::lowSurrogate(ucs4);
315 }
316
317 input = src - 1;
318 return true;
319}
320
321static void unicodeToEncodedUtf8(QString &result, ushort *&output, const ushort *begin,
322 const ushort *&input, const ushort *end, ushort decoded)
323{
324 // calculate the utf8 length and ensure enough space is available
325 int utf8len = QChar::isHighSurrogate(ucs4: decoded) ? 4 : decoded >= 0x800 ? 3 : 2;
326
327 // detach
328 if (!output) {
329 // we need 3 * utf8len for the encoded UTF-8 sequence
330 // but ensureDetached already adds 3 for the char we're processing
331 ensureDetached(result, output, begin, input, end, add: 3*utf8len - 3);
332 } else {
333 // verify that there's enough space or expand
334 int charsRemaining = end - input - 1; // not including this one
335 int pos = output - reinterpret_cast<const ushort *>(result.constData());
336 int spaceRemaining = result.size() - pos;
337 if (spaceRemaining < 3*charsRemaining + 3*utf8len) {
338 // must resize
339 result.resize(size: result.size() + 3*utf8len);
340
341 // we know that resize() above detached, so we bypass the reference count check
342 output = const_cast<ushort *>(reinterpret_cast<const ushort *>(result.constData()));
343 output += pos;
344 }
345 }
346
347 ++input;
348 int res = QUtf8Functions::toUtf8<QUrlUtf8Traits>(u: decoded, dst&: output, src&: input, end);
349 --input;
350 if (res < 0) {
351 // bad surrogate pair sequence
352 // we will encode bad UTF-16 to UTF-8
353 // but they don't get decoded back
354
355 // first of three bytes
356 uchar c = 0xe0 | uchar(decoded >> 12);
357 *output++ = '%';
358 *output++ = 'E';
359 *output++ = encodeNibble(c: c & 0xf);
360
361 // second byte
362 c = 0x80 | (uchar(decoded >> 6) & 0x3f);
363 *output++ = '%';
364 *output++ = encodeNibble(c: c >> 4);
365 *output++ = encodeNibble(c: c & 0xf);
366
367 // third byte
368 c = 0x80 | (decoded & 0x3f);
369 *output++ = '%';
370 *output++ = encodeNibble(c: c >> 4);
371 *output++ = encodeNibble(c: c & 0xf);
372 }
373}
374
375static int recode(QString &result, const ushort *begin, const ushort *end, QUrl::ComponentFormattingOptions encoding,
376 const uchar *actionTable, bool retryBadEncoding)
377{
378 const int origSize = result.size();
379 const ushort *input = begin;
380 ushort *output = nullptr;
381
382 EncodingAction action = EncodeCharacter;
383 for ( ; input != end; ++input) {
384 ushort c;
385 // try a run where no change is necessary
386 for ( ; input != end; ++input) {
387 c = *input;
388 if (c < 0x20U)
389 action = EncodeCharacter;
390 if (c < 0x20U || c >= 0x80U) // also: (c - 0x20 < 0x60U)
391 goto non_trivial;
392 action = EncodingAction(actionTable[c - ' ']);
393 if (action == EncodeCharacter)
394 goto non_trivial;
395 if (output)
396 *output++ = c;
397 }
398 break;
399
400non_trivial:
401 uint decoded;
402 if (c == '%' && retryBadEncoding) {
403 // always write "%25"
404 ensureDetached(result, output, begin, input, end);
405 *output++ = '%';
406 *output++ = '2';
407 *output++ = '5';
408 continue;
409 } else if (c == '%') {
410 // check if the input is valid
411 if (input + 2 >= end || (decoded = decodePercentEncoding(input)) == ushort(-1)) {
412 // not valid, retry
413 result.resize(size: origSize);
414 return recode(result, begin, end, encoding, actionTable, retryBadEncoding: true);
415 }
416
417 if (decoded >= 0x80) {
418 // decode the UTF-8 sequence
419 if (!(encoding & QUrl::EncodeUnicode) &&
420 encodedUtf8ToUtf16(result, output, begin, input, end, decoded))
421 continue;
422
423 // decoding the encoded UTF-8 failed
424 action = LeaveCharacter;
425 } else if (decoded >= 0x20) {
426 action = EncodingAction(actionTable[decoded - ' ']);
427 }
428 } else {
429 decoded = c;
430 if (decoded >= 0x80 && encoding & QUrl::EncodeUnicode) {
431 // encode the UTF-8 sequence
432 unicodeToEncodedUtf8(result, output, begin, input, end, decoded);
433 continue;
434 } else if (decoded >= 0x80) {
435 if (output)
436 *output++ = c;
437 continue;
438 }
439 }
440
441 // there are six possibilities:
442 // current \ action | DecodeCharacter | LeaveCharacter | EncodeCharacter
443 // decoded | 1:leave | 2:leave | 3:encode
444 // encoded | 4:decode | 5:leave | 6:leave
445 // cases 1 and 2 were handled before this section
446
447 if (c == '%' && action != DecodeCharacter) {
448 // cases 5 and 6: it's encoded and we're leaving it as it is
449 // except we're pedantic and we'll uppercase the hex
450 if (output || !isUpperHex(c: input[1]) || !isUpperHex(c: input[2])) {
451 ensureDetached(result, output, begin, input, end);
452 *output++ = '%';
453 *output++ = toUpperHex(c: *++input);
454 *output++ = toUpperHex(c: *++input);
455 }
456 } else if (c == '%' && action == DecodeCharacter) {
457 // case 4: we need to decode
458 ensureDetached(result, output, begin, input, end);
459 *output++ = decoded;
460 input += 2;
461 } else {
462 // must be case 3: we need to encode
463 ensureDetached(result, output, begin, input, end);
464 *output++ = '%';
465 *output++ = encodeNibble(c: c >> 4);
466 *output++ = encodeNibble(c: c & 0xf);
467 }
468 }
469
470 if (output) {
471 int len = output - reinterpret_cast<const ushort *>(result.constData());
472 result.truncate(pos: len);
473 return len - origSize;
474 }
475 return 0;
476}
477
478/*
479 * Returns true if the input it checked (if it checked anything) is not
480 * encoded. A return of false indicates there's a percent at \a input that
481 * needs to be decoded.
482 */
483#ifdef __SSE2__
484static bool simdCheckNonEncoded(ushort *&output, const ushort *&input, const ushort *end)
485{
486# ifdef __AVX2__
487 const __m256i percents256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('%'));
488 const __m128i percents = _mm256_castsi256_si128(percents256);
489# else
490 const __m128i percents = _mm_set1_epi16(w: '%');
491# endif
492
493 uint idx = 0;
494 quint32 mask = 0;
495 if (input + 16 <= end) {
496 qptrdiff offset = 0;
497 for ( ; input + offset + 16 <= end; offset += 16) {
498# ifdef __AVX2__
499 // do 32 bytes at a time using AVX2
500 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(input + offset));
501 __m256i comparison = _mm256_cmpeq_epi16(data, percents256);
502 mask = _mm256_movemask_epi8(comparison);
503 _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + offset), data);
504# else
505 // do 32 bytes at a time using unrolled SSE2
506 __m128i data1 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset));
507 __m128i data2 = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input + offset + 8));
508 __m128i comparison1 = _mm_cmpeq_epi16(a: data1, b: percents);
509 __m128i comparison2 = _mm_cmpeq_epi16(a: data2, b: percents);
510 uint mask1 = _mm_movemask_epi8(a: comparison1);
511 uint mask2 = _mm_movemask_epi8(a: comparison2);
512
513 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset), b: data1);
514 if (!mask1)
515 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output + offset + 8), b: data2);
516 mask = mask1 | (mask2 << 16);
517# endif
518
519 if (mask) {
520 idx = qCountTrailingZeroBits(v: mask) / 2;
521 break;
522 }
523 }
524
525 input += offset;
526 if (output)
527 output += offset;
528 } else if (input + 8 <= end) {
529 // do 16 bytes at a time
530 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(input));
531 __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents);
532 mask = _mm_movemask_epi8(a: comparison);
533 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(output), b: data);
534 idx = qCountTrailingZeroBits(v: quint16(mask)) / 2;
535 } else if (input + 4 <= end) {
536 // do 8 bytes only
537 __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(input));
538 __m128i comparison = _mm_cmpeq_epi16(a: data, b: percents);
539 mask = _mm_movemask_epi8(a: comparison) & 0xffu;
540 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(output), a: data);
541 idx = qCountTrailingZeroBits(v: quint8(mask)) / 2;
542 } else {
543 // no percents found (because we didn't check)
544 return true;
545 }
546
547 // advance to the next non-encoded
548 input += idx;
549 output += idx;
550
551 return !mask;
552}
553#else
554static bool simdCheckNonEncoded(...)
555{
556 return true;
557}
558#endif
559
560/*!
561 \since 5.0
562 \internal
563
564 This function decodes a percent-encoded string located from \a begin to \a
565 end, by appending each character to \a appendTo. It returns the number of
566 characters appended. Each percent-encoded sequence is decoded as follows:
567
568 \list
569 \li from %00 to %7F: the exact decoded value is appended;
570 \li from %80 to %FF: QChar::ReplacementCharacter is appended;
571 \li bad encoding: original input is copied to the output, undecoded.
572 \endlist
573
574 Given the above, it's important for the input to already have all UTF-8
575 percent sequences decoded by qt_urlRecode (that is, the input should not
576 have been processed with QUrl::EncodeUnicode).
577
578 The input should also be a valid percent-encoded sequence (the output of
579 qt_urlRecode is always valid).
580*/
581static int decode(QString &appendTo, const ushort *begin, const ushort *end)
582{
583 // fast check whether there's anything to be decoded in the first place
584 const ushort *input = QtPrivate::qustrchr(str: QStringView(begin, end), ch: '%');
585 if (Q_LIKELY(input == end))
586 return 0; // nothing to do, it was already decoded!
587
588 // detach
589 const int origSize = appendTo.size();
590 appendTo.resize(size: origSize + (end - begin));
591 ushort *output = reinterpret_cast<ushort *>(appendTo.begin()) + origSize;
592 memcpy(dest: static_cast<void *>(output), src: static_cast<const void *>(begin), n: (input - begin) * sizeof(ushort));
593 output += input - begin;
594
595 while (input != end) {
596 // something was encoded
597 Q_ASSERT(*input == '%');
598
599 if (Q_UNLIKELY(end - input < 3 || !isHex(input[1]) || !isHex(input[2]))) {
600 // badly-encoded data
601 appendTo.resize(size: origSize + (end - begin));
602 memcpy(dest: static_cast<void *>(appendTo.begin() + origSize), src: static_cast<const void *>(begin), n: (end - begin) * sizeof(ushort));
603 return end - begin;
604 }
605
606 ++input;
607 *output++ = decodeNibble(c: input[0]) << 4 | decodeNibble(c: input[1]);
608 if (output[-1] >= 0x80)
609 output[-1] = QChar::ReplacementCharacter;
610 input += 2;
611
612 // search for the next percent, copying from input to output
613 if (simdCheckNonEncoded(output, input, end)) {
614 while (input != end) {
615 ushort uc = *input;
616 if (uc == '%')
617 break;
618 *output++ = uc;
619 ++input;
620 }
621 }
622 }
623
624 int len = output - reinterpret_cast<ushort *>(appendTo.begin());
625 appendTo.truncate(pos: len);
626 return len - origSize;
627}
628
629template <size_t N>
630static void maskTable(uchar (&table)[N], const uchar (&mask)[N])
631{
632 for (size_t i = 0; i < N; ++i)
633 table[i] &= mask[i];
634}
635
636/*!
637 \internal
638
639 Recodes the string from \a begin to \a end. If any transformations are
640 done, append them to \a appendTo and return the number of characters added.
641 If no transformations were required, return 0.
642
643 The \a encoding option modifies the default behaviour:
644 \list
645 \li QUrl::DecodeReserved: if set, reserved characters will be decoded;
646 if unset, reserved characters will be encoded
647 \li QUrl::EncodeSpaces: if set, spaces will be encoded to "%20"; if unset, they will be " "
    \li QUrl::EncodeUnicode: if set, characters at or above U+0080 will be encoded to their UTF-8
                             percent-encoded form; if unset, they will be decoded to UTF-16
650 \li QUrl::FullyDecoded: if set, this function will decode all percent-encoded sequences,
651 including that of the percent character. The resulting string
652 will not be percent-encoded anymore. Use with caution!
653 In this mode, the behaviour is undefined if the input string
654 contains any percent-encoding sequences above %80.
655 Also, the function will not correct bad % sequences.
656 \endlist
657
658 Other flags are ignored (including QUrl::EncodeReserved).
659
660 The \a tableModifications argument can be used to supply extra
661 modifications to the tables, to be applied after the flags above are
662 handled. It consists of a sequence of 16-bit values, where the low 8 bits
663 indicate the character in question and the high 8 bits are either \c
664 EncodeCharacter, \c LeaveCharacter or \c DecodeCharacter.
665
666 This function corrects percent-encoded errors by interpreting every '%' as
667 meaning "%25" (all percents in the same content).
668 */
669
670Q_AUTOTEST_EXPORT int
671qt_urlRecode(QString &appendTo, const QChar *begin, const QChar *end,
672 QUrl::ComponentFormattingOptions encoding, const ushort *tableModifications)
673{
674 uchar actionTable[sizeof defaultActionTable];
675 if ((encoding & QUrl::FullyDecoded) == QUrl::FullyDecoded) {
676 return decode(appendTo, begin: reinterpret_cast<const ushort *>(begin), end: reinterpret_cast<const ushort *>(end));
677 }
678
679 memcpy(dest: actionTable, src: defaultActionTable, n: sizeof actionTable);
680 if (encoding & QUrl::DecodeReserved)
681 maskTable(table&: actionTable, mask: reservedMask);
682 if (!(encoding & QUrl::EncodeSpaces))
683 actionTable[0] = DecodeCharacter; // decode
684
685 if (tableModifications) {
686 for (const ushort *p = tableModifications; *p; ++p)
687 actionTable[uchar(*p) - ' '] = *p >> 8;
688 }
689
690 return recode(result&: appendTo, begin: reinterpret_cast<const ushort *>(begin), end: reinterpret_cast<const ushort *>(end),
691 encoding, actionTable, retryBadEncoding: false);
692}
693
694// qstring.cpp
695bool qt_is_ascii(const char *&ptr, const char *end) noexcept;
696
697/*!
698 \internal
699 \since 5.0
700
701 \a ba contains an 8-bit form of the component and it might be
702 percent-encoded already. We can't use QString::fromUtf8 because it might
703 contain non-UTF8 sequences. We can't use QByteArray::toPercentEncoding
704 because it might already contain percent-encoded sequences. We can't use
705 qt_urlRecode because it needs UTF-16 input.
706*/
707Q_AUTOTEST_EXPORT
708QString qt_urlRecodeByteArray(const QByteArray &ba)
709{
710 if (ba.isNull())
711 return QString();
712
713 // scan ba for anything above or equal to 0x80
714 // control points below 0x20 are fine in QString
715 const char *in = ba.constData();
716 const char *const end = ba.constEnd();
717 if (qt_is_ascii(ptr&: in, end)) {
718 // no non-ASCII found, we're safe to convert to QString
719 return QString::fromLatin1(str: ba, size: ba.size());
720 }
721
722 // we found something that we need to encode
723 QByteArray intermediate = ba;
724 intermediate.resize(size: ba.size() * 3 - (in - ba.constData()));
725 uchar *out = reinterpret_cast<uchar *>(intermediate.data() + (in - ba.constData()));
726 for ( ; in < end; ++in) {
727 if (*in & 0x80) {
728 // encode
729 *out++ = '%';
730 *out++ = encodeNibble(c: uchar(*in) >> 4);
731 *out++ = encodeNibble(c: uchar(*in) & 0xf);
732 } else {
733 // keep
734 *out++ = uchar(*in);
735 }
736 }
737
738 // now it's safe to call fromLatin1
739 return QString::fromLatin1(str: intermediate, size: out - reinterpret_cast<uchar *>(intermediate.data()));
740}
741
742QT_END_NAMESPACE
743

source code of qtbase/src/corelib/io/qurlrecode.cpp