// Copyright (C) 2016 The Qt Company Ltd.
// Copyright (C) 2016 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qurl_p.h"
6
7#include <QtCore/qstringlist.h>
8#include <QtCore/private/qnumeric_p.h>
9#include <QtCore/private/qoffsetstringarray_p.h>
10#include <QtCore/private/qstringiterator_p.h>
11#include <QtCore/private/qunicodetables_p.h>
12
13#include <algorithm>
14
15QT_BEGIN_NAMESPACE
16
17using namespace Qt::StringLiterals;
18
19// needed by the punycode encoder/decoder
20static const uint base = 36;
21static const uint tmin = 1;
22static const uint tmax = 26;
23static const uint skew = 38;
24static const uint damp = 700;
25static const uint initial_bias = 72;
26static const uint initial_n = 128;
27
28static constexpr qsizetype MaxDomainLabelLength = 63;
29
// Maps a Punycode digit value to the code of the character representing it:
// 0..25 become 'a'..'z' and 26..35 become '0'..'9'.
static inline uint encodeDigit(uint digit)
{
    if (digit < 26)
        return digit + 97; // 'a' + digit
    return digit + 22;     // '0' + (digit - 26)
}
34
35static inline uint adapt(uint delta, uint numpoints, bool firsttime)
36{
37 delta /= (firsttime ? damp : 2);
38 delta += (delta / numpoints);
39
40 uint k = 0;
41 for (; delta > ((base - tmin) * tmax) / 2; k += base)
42 delta /= (base - tmin);
43
44 return k + (((base - tmin + 1) * delta) / (delta + skew));
45}
46
47static inline void appendEncode(QString *output, uint delta, uint bias)
48{
49 uint qq;
50 uint k;
51 uint t;
52
53 // insert the variable length delta integer.
54 for (qq = delta, k = base;; k += base) {
55 // stop generating digits when the threshold is
56 // detected.
57 t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias;
58 if (qq < t) break;
59
60 *output += QChar(encodeDigit(digit: t + (qq - t) % (base - t)));
61 qq = (qq - t) / (base - t);
62 }
63
64 *output += QChar(encodeDigit(digit: qq));
65}
66
67Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
68{
69 uint n = initial_n;
70 uint delta = 0;
71 uint bias = initial_bias;
72
73 // Do not try to encode strings that certainly will result in output
74 // that is longer than allowable domain name label length. Note that
75 // non-BMP codepoints are encoded as two QChars.
76 if (in.size() > MaxDomainLabelLength * 2)
77 return;
78
79 int outLen = output->size();
80 output->resize(size: outLen + in.size());
81
82 QChar *d = output->data() + outLen;
83 bool skipped = false;
84 // copy all basic code points verbatim to output.
85 for (QChar c : in) {
86 if (c.unicode() < 0x80)
87 *d++ = c;
88 else
89 skipped = true;
90 }
91
92 // if there were only basic code points, just return them
93 // directly; don't do any encoding.
94 if (!skipped)
95 return;
96
97 output->truncate(pos: d - output->constData());
98 int copied = output->size() - outLen;
99
100 // h and b now contain the number of basic code points in input.
101 uint b = copied;
102 uint h = copied;
103
104 // if basic code points were copied, add the delimiter character.
105 if (h > 0)
106 *output += u'-';
107
108 // compute the input length in Unicode code points.
109 uint inputLength = 0;
110 for (QStringIterator iter(in); iter.hasNext();) {
111 inputLength++;
112
113 if (iter.next(invalidAs: char32_t(-1)) == char32_t(-1)) {
114 output->truncate(pos: outLen);
115 return; // invalid surrogate pair
116 }
117 }
118
119 // while there are still unprocessed non-basic code points left in
120 // the input string...
121 while (h < inputLength) {
122 // find the character in the input string with the lowest unprocessed value.
123 uint m = std::numeric_limits<uint>::max();
124 for (QStringIterator iter(in); iter.hasNext();) {
125 auto c = iter.nextUnchecked();
126 static_assert(std::numeric_limits<decltype(m)>::max()
127 >= std::numeric_limits<decltype(c)>::max(),
128 "Punycode uint should be able to cover all codepoints");
129 if (c >= n && c < m)
130 m = c;
131 }
132
133 // delta = delta + (m - n) * (h + 1), fail on overflow
134 uint tmp;
135 if (qMulOverflow<uint>(v1: m - n, v2: h + 1, r: &tmp) || qAddOverflow<uint>(v1: delta, v2: tmp, r: &delta)) {
136 output->truncate(pos: outLen);
137 return; // punycode_overflow
138 }
139 n = m;
140
141 for (QStringIterator iter(in); iter.hasNext();) {
142 auto c = iter.nextUnchecked();
143
144 // increase delta until we reach the character processed in this iteration;
145 // fail if delta overflows.
146 if (c < n) {
147 if (qAddOverflow<uint>(v1: delta, v2: 1, r: &delta)) {
148 output->truncate(pos: outLen);
149 return; // punycode_overflow
150 }
151 }
152
153 if (c == n) {
154 appendEncode(output, delta, bias);
155
156 bias = adapt(delta, numpoints: h + 1, firsttime: h == b);
157 delta = 0;
158 ++h;
159 }
160 }
161
162 ++delta;
163 ++n;
164 }
165
166 // prepend ACE prefix
167 output->insert(i: outLen, s: "xn--"_L1);
168 return;
169}
170
171Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc)
172{
173 uint n = initial_n;
174 uint i = 0;
175 uint bias = initial_bias;
176
177 // Do not try to decode strings longer than allowable for a domain label.
178 // Non-ASCII strings are not allowed here anyway, so there is no need
179 // to account for surrogates.
180 if (pc.size() > MaxDomainLabelLength)
181 return QString();
182
183 // strip any ACE prefix
184 int start = pc.startsWith(s: "xn--"_L1) ? 4 : 0;
185 if (!start)
186 return pc;
187
188 // find the last delimiter character '-' in the input array. copy
189 // all data before this delimiter directly to the output array.
190 int delimiterPos = pc.lastIndexOf(c: u'-');
191 auto output = delimiterPos < 4 ? std::u32string()
192 : pc.mid(position: start, n: delimiterPos - start).toStdU32String();
193
194 // if a delimiter was found, skip to the position after it;
195 // otherwise start at the front of the input string. everything
196 // before the delimiter is assumed to be basic code points.
197 uint cnt = delimiterPos + 1;
198
199 // loop through the rest of the input string, inserting non-basic
200 // characters into output as we go.
201 while (cnt < (uint) pc.size()) {
202 uint oldi = i;
203 uint w = 1;
204
205 // find the next index for inserting a non-basic character.
206 for (uint k = base; cnt < (uint) pc.size(); k += base) {
207 // grab a character from the punycode input and find its
208 // delta digit (each digit code is part of the
209 // variable-length integer delta)
210 uint digit = pc.at(i: cnt++).unicode();
211 if (digit - 48 < 10) digit -= 22;
212 else if (digit - 65 < 26) digit -= 65;
213 else if (digit - 97 < 26) digit -= 97;
214 else digit = base;
215
216 // Fail if the code point has no digit value
217 if (digit >= base)
218 return QString();
219
220 // i = i + digit * w, fail on overflow
221 uint tmp;
222 if (qMulOverflow<uint>(v1: digit, v2: w, r: &tmp) || qAddOverflow<uint>(v1: i, v2: tmp, r: &i))
223 return QString();
224
225 // detect threshold to stop reading delta digits
226 uint t;
227 if (k <= bias) t = tmin;
228 else if (k >= bias + tmax) t = tmax;
229 else t = k - bias;
230
231 if (digit < t) break;
232
233 // w = w * (base - t), fail on overflow
234 if (qMulOverflow<uint>(v1: w, v2: base - t, r: &w))
235 return QString();
236 }
237
238 // find new bias and calculate the next non-basic code
239 // character.
240 uint outputLength = static_cast<uint>(output.length());
241 bias = adapt(delta: i - oldi, numpoints: outputLength + 1, firsttime: oldi == 0);
242
243 // n = n + i div (length(output) + 1), fail on overflow
244 if (qAddOverflow<uint>(v1: n, v2: i / (outputLength + 1), r: &n))
245 return QString();
246
247 // allow the deltas to wrap around
248 i %= (outputLength + 1);
249
250 // if n is a basic code point then fail; this should not happen with
251 // correct implementation of Punycode, but check just n case.
252 if (n < initial_n) {
253 // Don't use Q_ASSERT() to avoid possibility of DoS
254 qWarning(msg: "Attempt to insert a basic codepoint. Unhandled overflow?");
255 return QString();
256 }
257
258 // Surrogates should normally be rejected later by other IDNA code.
259 // But because of Qt's use of UTF-16 to represent strings the
260 // IDNA code is not able to distinguish characters represented as pairs
261 // of surrogates from normal code points. This is why surrogates are
262 // not allowed here.
263 //
264 // Allowing surrogates would lead to non-unique (after normalization)
265 // encoding of strings with non-BMP characters.
266 //
267 // Punycode that encodes characters outside the Unicode range is also
268 // invalid and is rejected here.
269 if (QChar::isSurrogate(ucs4: n) || n > QChar::LastValidCodePoint)
270 return QString();
271
272 // insert the character n at position i
273 output.insert(pos: i, n: 1, c: static_cast<char32_t>(n));
274 ++i;
275 }
276
277 return QString::fromStdU32String(s: output);
278}
279
280static constexpr auto idn_whitelist = qOffsetStringArray(
281 strings: "ac", strings: "ar", strings: "asia", strings: "at",
282 strings: "biz", strings: "br",
283 strings: "cat", strings: "ch", strings: "cl", strings: "cn", strings: "com",
284 strings: "de", strings: "dk",
285 strings: "es",
286 strings: "fi",
287 strings: "gr",
288 strings: "hu",
289 strings: "il", strings: "info", strings: "io", strings: "is", strings: "ir",
290 strings: "jp",
291 strings: "kr",
292 strings: "li", strings: "lt", strings: "lu", strings: "lv",
293 strings: "museum",
294 strings: "name", strings: "net", strings: "no", strings: "nu", strings: "nz",
295 strings: "org",
296 strings: "pl", strings: "pr",
297 strings: "se", strings: "sh",
298 strings: "tel", strings: "th", strings: "tm", strings: "tw",
299 strings: "ua",
300 strings: "vn",
301 strings: "xn--fiqs8s", // China
302 strings: "xn--fiqz9s", // China
303 strings: "xn--fzc2c9e2c", // Sri Lanka
304 strings: "xn--j6w193g", // Hong Kong
305 strings: "xn--kprw13d", // Taiwan
306 strings: "xn--kpry57d", // Taiwan
307 strings: "xn--mgba3a4f16a", // Iran
308 strings: "xn--mgba3a4fra", // Iran
309 strings: "xn--mgbaam7a8h", // UAE
310 strings: "xn--mgbayh7gpa", // Jordan
311 strings: "xn--mgberp4a5d4ar", // Saudi Arabia
312 strings: "xn--ogbpf8fl", // Syria
313 strings: "xn--p1ai", // Russian Federation
314 strings: "xn--wgbh1c", // Egypt
315 strings: "xn--wgbl6a", // Qatar
316 strings: "xn--xkc2al3hye2a" // Sri Lanka
317);
318
319Q_CONSTINIT static QStringList *user_idn_whitelist = nullptr;
320
321static bool lessThan(const QChar *a, int l, const char *c)
322{
323 const auto *uc = reinterpret_cast<const char16_t *>(a);
324 const char16_t *e = uc + l;
325
326 if (!c || *c == 0)
327 return false;
328
329 while (*c) {
330 if (uc == e || *uc != static_cast<unsigned char>(*c))
331 break;
332 ++uc;
333 ++c;
334 }
335 return uc == e ? *c : (*uc < static_cast<unsigned char>(*c));
336}
337
338static bool equal(const QChar *a, int l, const char *b)
339{
340 while (l && a->unicode() && *b) {
341 if (*a != QLatin1Char(*b))
342 return false;
343 ++a;
344 ++b;
345 --l;
346 }
347 return l == 0;
348}
349
350static bool qt_is_idn_enabled(QStringView aceDomain)
351{
352 auto idx = aceDomain.lastIndexOf(c: u'.');
353 if (idx == -1)
354 return false;
355
356 auto tldString = aceDomain.mid(pos: idx + 1);
357 const auto len = tldString.size();
358
359 const QChar *tld = tldString.constData();
360
361 if (user_idn_whitelist)
362 return user_idn_whitelist->contains(str: tldString);
363
364 int l = 0;
365 int r = idn_whitelist.count() - 1;
366 int i = (l + r + 1) / 2;
367
368 while (r != l) {
369 if (lessThan(a: tld, l: len, c: idn_whitelist.at(index: i)))
370 r = i - 1;
371 else
372 l = i;
373 i = (l + r + 1) / 2;
374 }
375 return equal(a: tld, l: len, b: idn_whitelist.at(index: i));
376}
377
// Returns true if \a c may appear in an already normalized ASCII label:
// a lower-case letter, a digit, '-' or '_'.
template<typename C>
static inline bool isValidInNormalizedAsciiLabel(C c)
{
    const bool isLowerCaseLetter = c >= u'a' && c <= u'z';
    const bool isDigit = c >= u'0' && c <= u'9';
    const bool isPunctuation = c == u'-' || c == u'_';
    return isPunctuation || isDigit || isLowerCaseLetter;
}
383
// Returns true if \a c may appear in an already normalized ASCII domain
// name: anything allowed in a label, plus the label separator '.'.
template<typename C>
static inline bool isValidInNormalizedAsciiName(C c)
{
    if (isValidInNormalizedAsciiLabel(c))
        return true;
    return c == u'.';
}
389
390/*
391 Map domain name according to algorithm in UTS #46, 4.1
392
393 Returns empty string if there are disallowed characters in the input.
394
395 Sets resultIsAscii if the result is known for sure to be all ASCII.
396*/
397static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options,
398 bool *resultIsAscii)
399{
400 *resultIsAscii = true;
401
402 // Check if the input is already normalized ASCII first and can be returned as is.
403 int i = 0;
404 for (auto c : in) {
405 if (c.unicode() >= 0x80 || !isValidInNormalizedAsciiName(c))
406 break;
407 i++;
408 }
409
410 if (i == in.size())
411 return in;
412
413 QString result;
414 result.reserve(asize: in.size());
415 result.append(uc: in.constData(), len: i);
416 bool allAscii = true;
417
418 for (QStringIterator iter(QStringView(in).sliced(pos: i)); iter.hasNext();) {
419 char32_t uc = iter.next();
420
421 // Fast path for ASCII-only inputs
422 if (Q_LIKELY(uc < 0x80)) {
423 if (uc >= U'A' && uc <= U'Z')
424 uc |= 0x20; // lower-case it
425
426 if (!isValidInNormalizedAsciiName(c: uc))
427 return {};
428
429 result.append(c: static_cast<char16_t>(uc));
430 continue;
431 }
432 allAscii = false;
433
434 QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(ucs4: uc);
435
436 if (status == QUnicodeTables::IdnaStatus::Deviation)
437 status = options.testFlag(flag: QUrl::AceTransitionalProcessing)
438 ? QUnicodeTables::IdnaStatus::Mapped
439 : QUnicodeTables::IdnaStatus::Valid;
440
441 switch (status) {
442 case QUnicodeTables::IdnaStatus::Ignored:
443 continue;
444 case QUnicodeTables::IdnaStatus::Valid:
445 for (auto c : QChar::fromUcs4(c: uc))
446 result.append(c);
447 break;
448 case QUnicodeTables::IdnaStatus::Mapped:
449 result.append(v: QUnicodeTables::idnaMapping(usc4: uc));
450 break;
451 case QUnicodeTables::IdnaStatus::Disallowed:
452 return {};
453 default:
454 Q_UNREACHABLE();
455 }
456 }
457
458 *resultIsAscii = allAscii;
459 return result;
460}
461
462/*
463 Check the rules for an ASCII label.
464
465 Check the size restriction and that the label does not start or end with dashes.
466
467 The label should be nonempty.
468*/
469static bool validateAsciiLabel(QStringView label)
470{
471 if (label.size() > MaxDomainLabelLength)
472 return false;
473
474 if (label.first() == u'-' || label.last() == u'-')
475 return false;
476
477 return std::all_of(first: label.begin(), last: label.end(), pred: isValidInNormalizedAsciiLabel<QChar>);
478}
479
480namespace {
481
482class DomainValidityChecker
483{
484 bool domainNameIsBidi = false;
485 bool hadBidiErrors = false;
486
487 static constexpr char32_t ZWNJ = U'\u200C';
488 static constexpr char32_t ZWJ = U'\u200D';
489
490public:
491 DomainValidityChecker() { }
492 bool checkLabel(const QString &label, QUrl::AceProcessingOptions options);
493
494private:
495 static bool checkContextJRules(QStringView label);
496 static bool checkBidiRules(QStringView label);
497};
498
499} // anonymous namespace
500
501/*
502 Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2.
503
504 Rule Set for U+200C (ZWNJ):
505
506 False;
507
508 If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
509
510 If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
511
512 (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
513
514 Rule Set for U+200D (ZWJ):
515
516 False;
517
518 If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
519
520*/
521bool DomainValidityChecker::checkContextJRules(QStringView label)
522{
523 constexpr unsigned char CombiningClassVirama = 9;
524
525 enum class State {
526 Initial,
527 LD_T, // L,D with possible following T*
528 ZWNJ_T, // ZWNJ with possible following T*
529 };
530 State regexpState = State::Initial;
531 bool previousIsVirama = false;
532
533 for (QStringIterator iter(label); iter.hasNext();) {
534 auto ch = iter.next();
535
536 if (ch == ZWJ) {
537 if (!previousIsVirama)
538 return false;
539 regexpState = State::Initial;
540 } else if (ch == ZWNJ) {
541 if (!previousIsVirama && regexpState != State::LD_T)
542 return false;
543 regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T;
544 } else {
545 switch (QChar::joiningType(ucs4: ch)) {
546 case QChar::Joining_Left:
547 if (regexpState == State::ZWNJ_T)
548 return false;
549 regexpState = State::LD_T;
550 break;
551 case QChar::Joining_Right:
552 regexpState = State::Initial;
553 break;
554 case QChar::Joining_Dual:
555 regexpState = State::LD_T;
556 break;
557 case QChar::Joining_Transparent:
558 break;
559 default:
560 regexpState = State::Initial;
561 break;
562 }
563 }
564
565 previousIsVirama = QChar::combiningClass(ucs4: ch) == CombiningClassVirama;
566 }
567
568 return regexpState != State::ZWNJ_T;
569}
570
571/*
572 Check if the label conforms to BiDi rule of RFC 5893.
573
574 1. The first character must be a character with Bidi property L, R,
575 or AL. If it has the R or AL property, it is an RTL label; if it
576 has the L property, it is an LTR label.
577
578 2. In an RTL label, only characters with the Bidi properties R, AL,
579 AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
580
581 3. In an RTL label, the end of the label must be a character with
582 Bidi property R, AL, EN, or AN, followed by zero or more
583 characters with Bidi property NSM.
584
585 4. In an RTL label, if an EN is present, no AN may be present, and
586 vice versa.
587
588 5. In an LTR label, only characters with the Bidi properties L, EN,
589 ES, CS, ET, ON, BN, or NSM are allowed.
590
591 6. In an LTR label, the end of the label must be a character with
592 Bidi property L or EN, followed by zero or more characters with
593 Bidi property NSM.
594*/
595bool DomainValidityChecker::checkBidiRules(QStringView label)
596{
597 if (label.isEmpty())
598 return true;
599
600 QStringIterator iter(label);
601 Q_ASSERT(iter.hasNext());
602
603 char32_t ch = iter.next();
604 bool labelIsRTL = false;
605
606 switch (QChar::direction(ucs4: ch)) {
607 case QChar::DirL:
608 break;
609 case QChar::DirR:
610 case QChar::DirAL:
611 labelIsRTL = true;
612 break;
613 default:
614 return false;
615 }
616
617 bool tailOk = true;
618 bool labelHasEN = false;
619 bool labelHasAN = false;
620
621 while (iter.hasNext()) {
622 ch = iter.next();
623
624 switch (QChar::direction(ucs4: ch)) {
625 case QChar::DirR:
626 case QChar::DirAL:
627 if (!labelIsRTL)
628 return false;
629 tailOk = true;
630 break;
631
632 case QChar::DirL:
633 if (labelIsRTL)
634 return false;
635 tailOk = true;
636 break;
637
638 case QChar::DirES:
639 case QChar::DirCS:
640 case QChar::DirET:
641 case QChar::DirON:
642 case QChar::DirBN:
643 tailOk = false;
644 break;
645
646 case QChar::DirNSM:
647 break;
648
649 case QChar::DirAN:
650 if (labelIsRTL) {
651 if (labelHasEN)
652 return false;
653 labelHasAN = true;
654 tailOk = true;
655 } else {
656 return false;
657 }
658 break;
659
660 case QChar::DirEN:
661 if (labelIsRTL) {
662 if (labelHasAN)
663 return false;
664 labelHasEN = true;
665 }
666 tailOk = true;
667 break;
668
669 default:
670 return false;
671 }
672 }
673
674 return tailOk;
675}
676
677/*
678 Check if the given label is valid according to UTS #46 validity criteria.
679
680 NFC check can be skipped if the label was transformed to NFC before calling
681 this function (as optimization).
682
683 The domain name is considered invalid if this function returns false at least
684 once.
685
686 1. The label must be in Unicode Normalization Form NFC.
687 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
688 in both the third and fourth positions.
689 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character.
690 4. The label must not contain a U+002E ( . ) FULL STOP.
691 5. The label must not begin with a combining mark, that is: General_Category=Mark.
692 6. Each code point in the label must only have certain status values according to Section 5,
693 IDNA Mapping Table:
694 1. For Transitional Processing, each value must be valid.
695 2. For Nontransitional Processing, each value must be either valid or deviation.
696 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode
697 Code Points and Internationalized Domain Names for Applications (IDNA).
698 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy
699 all six of the numbered conditions in RFC 5893, Section 2.
700
701 NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid
702 memory allocation when there is nothing to normalize.
703*/
704bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options)
705{
706 if (label.isEmpty())
707 return true;
708
709 if (label != label.normalized(mode: QString::NormalizationForm_C))
710 return false;
711
712 if (label.size() >= 4) {
713 // This assumes that the first two characters are in BMP, but that's ok
714 // because non-BMP characters are unlikely to be used for specifying
715 // future extensions.
716 if (label[2] == u'-' && label[3] == u'-')
717 return false;
718 }
719
720 if (label.startsWith(c: u'-') || label.endsWith(c: u'-'))
721 return false;
722
723 if (label.contains(c: u'.'))
724 return false;
725
726 QStringIterator iter(label);
727 auto c = iter.next();
728
729 if (QChar::isMark(ucs4: c))
730 return false;
731
732 // As optimization, CONTEXTJ rules check can be skipped if no
733 // ZWJ/ZWNJ characters were found during the first pass.
734 bool hasJoiners = false;
735
736 for (;;) {
737 hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ;
738
739 if (!domainNameIsBidi) {
740 switch (QChar::direction(ucs4: c)) {
741 case QChar::DirR:
742 case QChar::DirAL:
743 case QChar::DirAN:
744 domainNameIsBidi = true;
745 if (hadBidiErrors)
746 return false;
747 break;
748 default:
749 break;
750 }
751 }
752
753 switch (QUnicodeTables::idnaStatus(ucs4: c)) {
754 case QUnicodeTables::IdnaStatus::Valid:
755 break;
756 case QUnicodeTables::IdnaStatus::Deviation:
757 if (options.testFlag(flag: QUrl::AceTransitionalProcessing))
758 return false;
759 break;
760 default:
761 return false;
762 }
763
764 if (!iter.hasNext())
765 break;
766 c = iter.next();
767 }
768
769 if (hasJoiners && !checkContextJRules(label))
770 return false;
771
772 hadBidiErrors = hadBidiErrors || !checkBidiRules(label);
773
774 if (domainNameIsBidi && hadBidiErrors)
775 return false;
776
777 return true;
778}
779
780static QString convertToAscii(const QString &normalizedDomain, AceLeadingDot dot)
781{
782 qsizetype lastIdx = 0;
783 QString aceForm; // this variable is here for caching
784 QString aceResult;
785
786 while (true) {
787 auto idx = normalizedDomain.indexOf(c: u'.', from: lastIdx);
788 if (idx == -1)
789 idx = normalizedDomain.size();
790
791 const auto labelLength = idx - lastIdx;
792 if (labelLength == 0) {
793 if (idx == normalizedDomain.size())
794 break;
795 if (dot == ForbidLeadingDot || idx > 0)
796 return {}; // two delimiters in a row -- empty label not allowed
797 } else {
798 const auto label = QStringView(normalizedDomain).sliced(pos: lastIdx, n: labelLength);
799 aceForm.clear();
800 qt_punycodeEncoder(in: label, output: &aceForm);
801 if (aceForm.isEmpty())
802 return {};
803
804 aceResult.append(s: aceForm);
805 }
806
807 if (idx == normalizedDomain.size())
808 break;
809
810 lastIdx = idx + 1;
811 aceResult += u'.';
812 }
813
814 return aceResult;
815}
816
817static bool checkAsciiDomainName(const QString &normalizedDomain, AceLeadingDot dot,
818 bool *usesPunycode)
819{
820 qsizetype lastIdx = 0;
821 bool hasPunycode = false;
822 *usesPunycode = false;
823
824 while (lastIdx < normalizedDomain.size()) {
825 auto idx = normalizedDomain.indexOf(c: u'.', from: lastIdx);
826 if (idx == -1)
827 idx = normalizedDomain.size();
828
829 const auto labelLength = idx - lastIdx;
830 if (labelLength == 0) {
831 if (idx == normalizedDomain.size())
832 break;
833 if (dot == ForbidLeadingDot || idx > 0)
834 return false; // two delimiters in a row -- empty label not allowed
835 } else {
836 const auto label = QStringView(normalizedDomain).sliced(pos: lastIdx, n: labelLength);
837 if (!validateAsciiLabel(label))
838 return false;
839
840 hasPunycode = hasPunycode || label.startsWith(s: "xn--"_L1);
841 }
842
843 lastIdx = idx + 1;
844 }
845
846 *usesPunycode = hasPunycode;
847 return true;
848}
849
850static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options)
851{
852 QString result;
853 result.reserve(asize: asciiDomain.size());
854 qsizetype lastIdx = 0;
855
856 DomainValidityChecker checker;
857
858 while (true) {
859 auto idx = asciiDomain.indexOf(c: u'.', from: lastIdx);
860 if (idx == -1)
861 idx = asciiDomain.size();
862
863 const auto labelLength = idx - lastIdx;
864 if (labelLength == 0) {
865 if (idx == asciiDomain.size())
866 break;
867 } else {
868 const auto label = asciiDomain.sliced(pos: lastIdx, n: labelLength);
869 const auto unicodeLabel = qt_punycodeDecoder(pc: label);
870
871 if (unicodeLabel.isEmpty())
872 return asciiDomain;
873
874 if (!checker.checkLabel(label: unicodeLabel, options))
875 return asciiDomain;
876
877 result.append(s: unicodeLabel);
878 }
879
880 if (idx == asciiDomain.size())
881 break;
882
883 lastIdx = idx + 1;
884 result += u'.';
885 }
886 return result;
887}
888
889QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
890 QUrl::AceProcessingOptions options)
891{
892 if (domain.isEmpty())
893 return {};
894
895 bool mappedToAscii;
896 const QString mapped = mapDomainName(in: domain, options, resultIsAscii: &mappedToAscii);
897 const QString normalized =
898 mappedToAscii ? mapped : mapped.normalized(mode: QString::NormalizationForm_C);
899
900 if (normalized.isEmpty())
901 return {};
902
903 bool needsCoversionToUnicode;
904 const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalizedDomain: normalized, dot);
905 if (aceResult.isEmpty() || !checkAsciiDomainName(normalizedDomain: aceResult, dot, usesPunycode: &needsCoversionToUnicode))
906 return {};
907
908 if (op == ToAceOnly || !needsCoversionToUnicode
909 || (!options.testFlag(flag: QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceDomain: aceResult))) {
910 return aceResult;
911 }
912
913 return convertToUnicode(asciiDomain: aceResult, options);
914}
915
916/*!
917 \since 4.2
918
919 Returns the current whitelist of top-level domains that are allowed
920 to have non-ASCII characters in their compositions.
921
922 See setIdnWhitelist() for the rationale of this list.
923
924 \sa AceProcessingOption
925*/
926QStringList QUrl::idnWhitelist()
927{
928 if (user_idn_whitelist)
929 return *user_idn_whitelist;
930 static const QStringList list = [] {
931 QStringList list;
932 list.reserve(asize: idn_whitelist.count());
933 int i = 0;
934 while (i < idn_whitelist.count()) {
935 list << QLatin1StringView(idn_whitelist.at(index: i));
936 ++i;
937 }
938 return list;
939 }();
940 return list;
941}
942
943/*!
944 \since 4.2
945
946 Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have
947 non-ASCII characters in domains to the value of \a list.
948
949 Note that if you call this function, you need to do so \e before
950 you start any threads that might access idnWhitelist().
951
952 Qt comes with a default list that contains the Internet top-level domains
953 that have published support for Internationalized Domain Names (IDNs)
954 and rules to guarantee that no deception can happen between similarly-looking
955 characters (such as the Latin lowercase letter \c 'a' and the Cyrillic
956 equivalent, which in most fonts are visually identical).
957
958 This list is periodically maintained, as registrars publish new rules.
959
960 This function is provided for those who need to manipulate the list, in
961 order to add or remove a TLD. It is not recommended to change its value
962 for purposes other than testing, as it may expose users to security risks.
963*/
964void QUrl::setIdnWhitelist(const QStringList &list)
965{
966 if (!user_idn_whitelist)
967 user_idn_whitelist = new QStringList;
968 *user_idn_whitelist = list;
969}
970
971QT_END_NAMESPACE
972

// source code of qtbase/src/corelib/io/qurlidna.cpp