1/*
2 * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013, 2015-2016 Apple Inc. All rights reserved.
3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "URL.h"
29
30#include "DecodeEscapeSequences.h"
31#include "MIMETypeRegistry.h"
32#include "TextEncoding.h"
33#include "UUID.h"
34#include <stdio.h>
35#include <unicode/uidna.h>
36#include <wtf/HashMap.h>
37#include <wtf/HexNumber.h>
38#include <wtf/NeverDestroyed.h>
39#include <wtf/StdLibExtras.h>
40#include <wtf/text/CString.h>
41#include <wtf/text/StringBuilder.h>
42#include <wtf/text/StringHash.h>
43
44// FIXME: This file makes too much use of the + operator on String.
45// We either have to optimize that operator so it doesn't involve
46// so many allocations, or change this to use StringBuffer instead.
47
48using namespace WTF;
49
50namespace WebCore {
51
52typedef Vector<char, 512> CharBuffer;
53typedef Vector<UChar, 512> UCharBuffer;
54
// 0xFFFF is reserved as the "invalid port" sentinel, so the largest port that
// is treated as valid is the value just below it.
static const unsigned invalidPortNumber = 0xFFFF;
static const unsigned maximumValidPortNumber = invalidPortNumber - 1;
57
58static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
59{
60 ASSERT(isASCIILower(lowercaseLetter));
61 return (character | 0x20) == lowercaseLetter;
62}
63
// Scheme names and default port digits, stored as raw character arrays.
// None of these arrays is NUL-terminated, so sizeof() yields the exact number
// of characters.

// WebSocket schemes.
static const char wsScheme[] = {'w', 's'};
static const char wssScheme[] = {'w', 's', 's'};

// Local files.
static const char fileScheme[] = {'f', 'i', 'l', 'e'};

// FTP.
static const char ftpScheme[] = {'f', 't', 'p'};
static const char ftpPort[] = {'2', '1'};

// HTTP and HTTPS.
static const char httpScheme[] = {'h', 't', 't', 'p'};
static const char httpPort[] = {'8', '0'};
static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
static const char httpsPort[] = {'4', '4', '3'};

// Gopher.
static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
static const char gopherPort[] = {'7', '0'};
75
76static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
77{
78 ASSERT(isASCIILower(lowercaseLetter));
79 return (character | 0x20) == lowercaseLetter;
80}
81
// Bit flags describing the URL syntax classes a byte can belong to. Every
// entry of characterClassTable is a bitwise OR of these flags.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 0x01,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 0x02,

    // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 0x04,

    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 0x08,

    // hexdigit | ":" | "%"
    IPv6Char = 0x10,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 0x20,

    // not allowed in path
    BadChar = 0x40,

    // "\t" | "\n" | "\r"
    TabNewline = 0x80
};

// Flag combinations shared by long runs of table entries.
// Letters that are also hexadecimal digits: A-F and a-f.
static const unsigned char hexLetterClasses = SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char;
// All remaining ASCII letters: G-Z and g-z.
static const unsigned char nonHexLetterClasses = SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar;
// Decimal digits 0-9.
static const unsigned char digitClasses = SchemeChar | UserInfoChar | HostnameChar | IPv6Char;

// Classification of every possible byte value, indexed by the byte itself.
static const unsigned char characterClassTable[256] = {
    /* 0 nul */ PathSegmentEndChar,
    /* 1 soh - 8 bs */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 9 ht */ BadChar | TabNewline,
    /* 10 nl */ BadChar | TabNewline,
    /* 11 vt, 12 np */ BadChar, BadChar,
    /* 13 cr */ BadChar | TabNewline,
    /* 14 so - 19 dc3 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 20 dc4 - 25 em */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 26 sub - 31 us */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 32 sp */ BadChar,
    /* 33 ! */ UserInfoChar,
    /* 34 " */ BadChar,
    /* 35 # */ PathSegmentEndChar | BadChar,
    /* 36 $ */ UserInfoChar,
    /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
    /* 38 & */ UserInfoChar,
    /* 39 ' */ UserInfoChar,
    /* 40 ( */ UserInfoChar,
    /* 41 ) */ UserInfoChar,
    /* 42 * */ UserInfoChar,
    /* 43 + */ SchemeChar | UserInfoChar,
    /* 44 , */ UserInfoChar,
    /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
    /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 47 / */ PathSegmentEndChar,
    /* 48 0 - 52 4 */ digitClasses, digitClasses, digitClasses, digitClasses, digitClasses,
    /* 53 5 - 57 9 */ digitClasses, digitClasses, digitClasses, digitClasses, digitClasses,
    /* 58 : */ UserInfoChar | IPv6Char,
    /* 59 ; */ UserInfoChar,
    /* 60 < */ BadChar,
    /* 61 = */ UserInfoChar,
    /* 62 > */ BadChar,
    /* 63 ? */ PathSegmentEndChar | BadChar,
    /* 64 @ */ 0,
    /* 65 A - 70 F */ hexLetterClasses, hexLetterClasses, hexLetterClasses, hexLetterClasses, hexLetterClasses, hexLetterClasses,
    /* 71 G - 75 K */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 76 L - 80 P */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 81 Q - 85 U */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 86 V - 90 Z */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 91 [ */ 0,
    /* 92 \ */ 0,
    /* 93 ] */ 0,
    /* 94 ^ */ 0,
    /* 95 _ */ UserInfoChar | HostnameChar,
    /* 96 ` */ 0,
    /* 97 a - 102 f */ hexLetterClasses, hexLetterClasses, hexLetterClasses, hexLetterClasses, hexLetterClasses, hexLetterClasses,
    /* 103 g - 107 k */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 108 l - 112 p */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 113 q - 117 u */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 118 v - 122 z */ nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses, nonHexLetterClasses,
    /* 123 { */ 0,
    /* 124 | */ 0,
    /* 125 } */ 0,
    /* 126 ~ */ UserInfoChar,
    /* 127 del */ BadChar,
    /* 128 - 135 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 136 - 143 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 144 - 151 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 152 - 159 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 160 - 167 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 168 - 175 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 176 - 183 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 184 - 191 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 192 - 199 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 200 - 207 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 208 - 215 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 216 - 223 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 224 - 231 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 232 - 239 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 240 - 247 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar,
    /* 248 - 255 */ BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar, BadChar
};
240
// Percent-encoding classes. The values are bit masks, and percentEncodeClassTable
// stores, for each byte, the mask of the broadest class that contains it.
enum PercentEncodeCharacterClass {
    // Class names match the URL Standard; each class is a superset of the previous one.
    PercentEncodeSimple = 0xFF,
    PercentEncodeDefault = 0x7F,
    PercentEncodePassword = 0x3F,
    PercentEncodeUsername = 0x1F,
};

// Percent-encoding class of every possible byte value, indexed by the byte
// itself. A value of 0 means the byte belongs to none of the classes above.
static const unsigned char percentEncodeClassTable[256] = {
    /* 0 nul - 7 bel */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 8 bs - 15 si */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 16 dle - 23 etb */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 24 can - 31 us */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 32 sp */ PercentEncodeDefault,
    /* 33 ! */ 0,
    /* 34 " */ PercentEncodeDefault,
    /* 35 # */ PercentEncodeDefault,
    /* 36 $ - 46 . */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 47 / */ PercentEncodePassword,
    /* 48 0 - 57 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 58 : */ PercentEncodeUsername,
    /* 59 ; */ 0,
    /* 60 < */ PercentEncodeDefault,
    /* 61 = */ 0,
    /* 62 > */ PercentEncodeDefault,
    /* 63 ? */ PercentEncodeDefault,
    /* 64 @ */ PercentEncodePassword,
    /* 65 A - 77 M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 78 N - 90 Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 91 [ */ 0,
    /* 92 \ */ PercentEncodePassword,
    /* 93 ] */ 0,
    /* 94 ^ */ 0,
    /* 95 _ */ 0,
    /* 96 ` */ PercentEncodeDefault,
    /* 97 a - 109 m */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 110 n - 122 z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 123 { */ 0,
    /* 124 | */ 0,
    /* 125 } */ 0,
    /* 126 ~ */ 0,
    /* 127 del */ PercentEncodeSimple,
    /* 128 - 135 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 136 - 143 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 144 - 151 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 152 - 159 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 160 - 167 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 168 - 175 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 176 - 183 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 184 - 191 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 192 - 199 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 200 - 207 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 208 - 215 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 216 - 223 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 224 - 231 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 232 - 239 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 240 - 247 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple,
    /* 248 - 255 */ PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple, PercentEncodeSimple
};
342
343static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
344static bool encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
345static String substituteBackslashes(const String&);
346
347static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
348static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
349static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
350static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
351static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
352static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
353static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
354static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
355static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
356static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
357static inline bool isTabNewline(UChar c) { return c <= 0xff && (characterClassTable[c] & TabNewline); }
358
359static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
360{
361 ASSERT(isSchemeChar(character));
362 ASSERT(schemeCharacter & 0x20);
363 ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
364 return (character | 0x20) == schemeCharacter;
365}
366
// Declaration only; no definition is visible at this point in the file.
// |whatToEncode| is one of the PercentEncodeCharacterClass values.
String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode);
368
369// Copies the source to the destination, assuming all the source characters are
370// ASCII. The destination buffer must be large enough. Null characters are allowed
371// in the source string, and no attempt is made to null-terminate the result.
372static void copyASCII(const String& string, char* dest)
373{
374 if (string.isEmpty())
375 return;
376
377 if (string.is8Bit())
378 memcpy(dest, string.characters8(), string.length());
379 else {
380 const UChar* src = string.characters16();
381 size_t length = string.length();
382 for (size_t i = 0; i < length; i++)
383 dest[i] = static_cast<char>(src[i]);
384 }
385}
386
387static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
388{
389 buffer.resize(base.length() + len + 1);
390 copyASCII(base, buffer.data());
391 memcpy(buffer.data() + base.length(), rel, len);
392 buffer[buffer.size() - 1] = '\0';
393}
394
395// FIXME: Move to WTFString.h eventually.
396// Returns the index of the first index in string |s| of any of the characters
397// in |toFind|. |toFind| should be a null-terminated string, all characters up
398// to the null will be searched. Returns int if not found.
399static int findFirstOf(StringView string, unsigned startPosition, const char* target)
400{
401 unsigned length = string.length();
402 for (unsigned i = startPosition; i < length; ++i) {
403 for (unsigned j = 0; target[j]; ++j) {
404 if (string[i] == target[j])
405 return i;
406 }
407 }
408 return -1;
409}
410
// Sanity check for a string that is supposed to be an already-encoded URL:
// asserts that it contains only ASCII characters and that it is either empty
// or begins with a character that may start a scheme. Both checks are made
// with ASSERT_UNUSED, which names |url| as the variable that may otherwise
// be unused.
static inline void checkEncodedString(const String& url)
{
    ASSERT_UNUSED(url, url.containsOnlyASCII());
    ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
}
416
// Forwards to the namespace-level WebCore::protocolIs(). The explicit
// qualification is needed because, inside the class scope, the unqualified
// name would resolve to this member function.
inline bool URL::protocolIs(const String& string, const char* protocol)
{
    return WebCore::protocolIs(string, protocol);
}
421
422void URL::invalidate()
423{
424 m_isValid = false;
425 m_protocolIsInHTTPFamily = false;
426 m_schemeEnd = 0;
427 m_userStart = 0;
428 m_userEnd = 0;
429 m_passwordEnd = 0;
430 m_hostEnd = 0;
431 m_portEnd = 0;
432 m_pathEnd = 0;
433 m_pathAfterLastSlash = 0;
434 m_queryEnd = 0;
435 m_fragmentEnd = 0;
436}
437
// Constructs a URL by running parse() on |url|. The ParsedURLStringTag
// overload is for strings the caller expects to come out of parse()
// unchanged; that expectation is asserted below, except for local files on
// Windows, which are exempted by the workaround.
URL::URL(ParsedURLStringTag, const String& url)
{
    parse(url);
#if OS(WINDOWS)
    // FIXME(148598): Work around Windows local file handling bug in CFNetwork
    ASSERT(isLocalFile() || url == m_string);
#else
    ASSERT(url == m_string);
#endif
}
448
// Resolves |relative| against |base|, passing UTF8Encoding() to init() as the
// text encoding.
URL::URL(const URL& base, const String& relative)
{
    init(base, relative, UTF8Encoding());
}
453
454URL::URL(const URL& base, const String& relative, const TextEncoding& encoding)
455{
456 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
457 // we do when submitting a form. A form with GET method
458 // has its contents added to a URL as query params and it makes sense
459 // to be consistent.
460 init(base, relative, encoding.encodingForFormSubmission());
461}
462
463static bool shouldTrimFromURL(UChar c)
464{
465 // Browsers ignore leading/trailing whitespace and control
466 // characters from URLs. Note that c is an *unsigned* char here
467 // so this comparison should only catch control characters.
468 return c <= ' ';
469}
470
471void URL::init(const URL& base, const String& relative, const TextEncoding& encoding)
472{
473 // Allow resolutions with a null or empty base URL, but not with any other invalid one.
474 // FIXME: Is this a good rule?
475 if (!base.m_isValid && !base.isEmpty()) {
476 m_string = relative;
477 invalidate();
478 return;
479 }
480
481 // Get rid of leading and trailing whitespace and control characters.
482 String rel = relative.stripWhiteSpace(shouldTrimFromURL);
483
484 // Get rid of any tabs and newlines.
485 rel = rel.removeCharacters(isTabNewline);
486
487 // For compatibility with Win IE, treat backslashes as if they were slashes,
488 // as long as we're not dealing with javascript: or data: URLs.
489 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
490 rel = substituteBackslashes(rel);
491
492 bool allASCII = rel.containsOnlyASCII();
493 CharBuffer strBuffer;
494 char* str;
495 size_t len;
496 if (allASCII) {
497 len = rel.length();
498 strBuffer.resize(len + 1);
499 copyASCII(rel, strBuffer.data());
500 strBuffer[len] = 0;
501 str = strBuffer.data();
502 } else {
503 if (!encodeRelativeString(rel, encoding, strBuffer)) {
504 m_string = blankURL();
505 invalidate();
506 return;
507 }
508
509 str = strBuffer.data();
510 len = strlen(str);
511 }
512
513 // According to the RFC, the reference should be interpreted as an
514 // absolute URI if possible, using the "leftmost, longest"
515 // algorithm. If the URI reference is absolute it will have a
516 // scheme, meaning that it will have a colon before the first
517 // non-scheme element.
518 bool absolute = false;
519 char* p = str;
520 if (isSchemeFirstChar(*p)) {
521 ++p;
522 while (isSchemeChar(*p)) {
523 ++p;
524 }
525 if (*p == ':') {
526 if (p[1] != '/' && equalIgnoringASCIICase(base.protocol(), StringView(reinterpret_cast<LChar*>(str), p - str)) && base.isHierarchical())
527 str = p + 1;
528 else
529 absolute = true;
530 }
531 }
532
533 CharBuffer parseBuffer;
534
535 if (absolute) {
536 parse(str, &relative);
537 } else {
538 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
539 // unless the relative URL is a single fragment.
540 if (!base.isHierarchical()) {
541 if (str[0] == '#') {
542 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
543 parse(parseBuffer.data(), &relative);
544 } else {
545 m_string = relative;
546 invalidate();
547 }
548 return;
549 }
550
551 switch (str[0]) {
552 case '\0':
553 // The reference is empty, so this is a reference to the same document with any fragment identifier removed.
554 *this = base;
555 removeFragmentIdentifier();
556 break;
557 case '#': {
558 // must be fragment-only reference
559 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
560 parse(parseBuffer.data(), &relative);
561 break;
562 }
563 case '?': {
564 // query-only reference, special case needed for non-URL results
565 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
566 parse(parseBuffer.data(), &relative);
567 break;
568 }
569 case '/':
570 // must be net-path or absolute-path reference
571 if (str[1] == '/') {
572 // net-path
573 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
574 parse(parseBuffer.data(), &relative);
575 } else {
576 // abs-path
577 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
578 parse(parseBuffer.data(), &relative);
579 }
580 break;
581 default:
582 {
583 // must be relative-path reference
584
585 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
586 const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
587 parseBuffer.resize(bufferSize);
588
589 char* bufferPos = parseBuffer.data();
590 char* bufferStart = bufferPos;
591
592 // first copy everything before the path from the base
593 CharBuffer baseStringBuffer(base.m_string.length());
594 copyASCII(base.m_string, baseStringBuffer.data());
595 const char* baseString = baseStringBuffer.data();
596 const char* baseStringStart = baseString;
597 const char* pathStart = baseStringStart + base.m_portEnd;
598 while (baseStringStart < pathStart)
599 *bufferPos++ = *baseStringStart++;
600 char* bufferPathStart = bufferPos;
601
602 // now copy the base path
603 const char* baseStringEnd = baseString + base.m_pathEnd;
604
605 // go back to the last slash
606 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
607 baseStringEnd--;
608
609 if (baseStringEnd == baseStringStart) {
610 // no path in base, add a path separator if necessary
611 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
612 *bufferPos++ = '/';
613 } else {
614 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
615 }
616
617 const char* relStringStart = str;
618 const char* relStringPos = relStringStart;
619
620 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
621 if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
622 if (isPathSegmentEndChar(relStringPos[1])) {
623 // skip over "." segment
624 relStringPos += 1;
625 if (relStringPos[0] == '/')
626 relStringPos++;
627 continue;
628 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
629 // skip over ".." segment and rewind the last segment
630 // the RFC leaves it up to the app to decide what to do with excess
631 // ".." segments - we choose to drop them since some web content
632 // relies on this.
633 relStringPos += 2;
634 if (relStringPos[0] == '/')
635 relStringPos++;
636 if (bufferPos > bufferPathStart + 1)
637 bufferPos--;
638 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/')
639 bufferPos--;
640 continue;
641 }
642 }
643
644 *bufferPos = *relStringPos;
645 relStringPos++;
646 bufferPos++;
647 }
648
649 // all done with the path work, now copy any remainder
650 // of the relative reference; this will also add a null terminator
651 strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
652
653 parse(parseBuffer.data(), &relative);
654
655 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
656 break;
657 }
658 }
659 }
660}
661
662URL URL::isolatedCopy() const
663{
664 URL result = *this;
665 result.m_string = result.m_string.isolatedCopy();
666 return result;
667}
668
669String URL::lastPathComponent() const
670{
671 if (!hasPath())
672 return String();
673
674 unsigned end = m_pathEnd - 1;
675 if (m_string[end] == '/')
676 --end;
677
678 size_t start = m_string.reverseFind('/', end);
679 if (start < static_cast<unsigned>(m_portEnd))
680 return String();
681 ++start;
682
683 return m_string.substring(start, end - start + 1);
684}
685
686String URL::protocol() const
687{
688 return m_string.left(m_schemeEnd);
689}
690
691String URL::host() const
692{
693 int start = hostStart();
694 return m_string.substring(start, m_hostEnd - start);
695}
696
697unsigned short URL::port() const
698{
699 // We return a port of 0 if there is no port specified. This can happen in two situations:
700 // 1) The URL contains no colon after the host name and before the path component of the URL.
701 // 2) The URL contains a colon but there's no port number before the path component of the URL begins.
702 if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1)
703 return 0;
704
705 bool ok = false;
706 unsigned number;
707 if (m_string.is8Bit())
708 number = charactersToUIntStrict(m_string.characters8() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
709 else
710 number = charactersToUIntStrict(m_string.characters16() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
711 if (!ok || number > maximumValidPortNumber)
712 return invalidPortNumber;
713 return number;
714}
715
716String URL::user() const
717{
718 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
719}
720
721String URL::pass() const
722{
723 if (m_passwordEnd == m_userEnd)
724 return String();
725
726 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
727}
728
729String URL::encodedUser() const
730{
731 return m_string.substring(m_userStart, m_userEnd - m_userStart);
732}
733
734String URL::encodedPass() const
735{
736 if (m_passwordEnd == m_userEnd)
737 return String();
738
739 return m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1);
740}
741
742String URL::fragmentIdentifier() const
743{
744 if (m_fragmentEnd == m_queryEnd)
745 return String();
746
747 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
748}
749
750bool URL::hasFragmentIdentifier() const
751{
752 return m_fragmentEnd != m_queryEnd;
753}
754
755String URL::baseAsString() const
756{
757 return m_string.left(m_pathAfterLastSlash);
758}
759
760#if !PLATFORM(QT) && !USE(CF)
761String URL::fileSystemPath() const
762{
763 if (!isValid() || !isLocalFile())
764 return String();
765
766 return decodeURLEscapeSequences(path());
767}
768#endif
769
770#ifdef NDEBUG
771
772static inline void assertProtocolIsGood(const char*)
773{
774}
775
776#else
777
778static void assertProtocolIsGood(const char* protocol)
779{
780 const char* p = protocol;
781 while (*p) {
782 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
783 ++p;
784 }
785}
786
787#endif
788
789bool URL::protocolIs(const char* protocol) const
790{
791 assertProtocolIsGood(protocol);
792
793 // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
794 // The free function protocolIsJavaScript() should be used instead.
795 ASSERT(!equalLettersIgnoringASCIICase(StringView(protocol), "javascript"));
796
797 if (!m_isValid)
798 return false;
799
800 // Do the comparison without making a new string object.
801 for (int i = 0; i < m_schemeEnd; ++i) {
802 if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
803 return false;
804 }
805 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
806}
807
808String URL::query() const
809{
810 if (m_queryEnd == m_pathEnd)
811 return String();
812
813 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
814}
815
816String URL::path() const
817{
818 return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
819}
820
821bool URL::setProtocol(const String& s)
822{
823 // Firefox and IE remove everything after the first ':'.
824 size_t separatorPosition = s.find(':');
825 String newProtocol = s.substring(0, separatorPosition);
826
827 if (!isValidProtocol(newProtocol))
828 return false;
829
830 if (!m_isValid) {
831 parse(newProtocol + ':' + m_string);
832 return true;
833 }
834
835 parse(newProtocol + m_string.substring(m_schemeEnd));
836 return true;
837}
838
839void URL::setHost(const String& s)
840{
841 if (!m_isValid)
842 return;
843
844 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
845 // and to avoid changing more than just the host.
846
847 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
848
849 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
850}
851
852void URL::removePort()
853{
854 if (m_hostEnd == m_portEnd)
855 return;
856 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
857}
858
859void URL::setPort(unsigned short i)
860{
861 if (!m_isValid)
862 return;
863
864 bool colonNeeded = m_portEnd == m_hostEnd;
865 int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
866
867 parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
868}
869
870void URL::setHostAndPort(const String& hostAndPort)
871{
872 if (!m_isValid)
873 return;
874
875 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
876 // and to avoid changing more than just host and port.
877
878 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
879
880 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
881}
882
883void URL::setUser(const String& user)
884{
885 if (!m_isValid)
886 return;
887
888 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
889 // and to avoid changing more than just the user login.
890
891 int end = m_userEnd;
892 if (!user.isEmpty()) {
893 String u = encodeWithURLEscapeSequences(user, PercentEncodeUsername);
894 if (m_userStart == m_schemeEnd + 1)
895 u = "//" + u;
896 // Add '@' if we didn't have one before.
897 if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
898 u.append('@');
899 parse(m_string.left(m_userStart) + u + m_string.substring(end));
900 } else {
901 // Remove '@' if we now have neither user nor password.
902 if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
903 end += 1;
904 // We don't want to parse in the extremely common case where we are not going to make a change.
905 if (m_userStart != end)
906 parse(m_string.left(m_userStart) + m_string.substring(end));
907 }
908}
909
910void URL::setPass(const String& password)
911{
912 if (!m_isValid)
913 return;
914
915 int end = m_passwordEnd;
916 if (!password.isEmpty()) {
917 String p = ":" + encodeWithURLEscapeSequences(password, PercentEncodePassword) + "@";
918 if (m_userEnd == m_schemeEnd + 1)
919 p = "//" + p;
920 // Eat the existing '@' since we are going to add our own.
921 if (end != m_hostEnd && m_string[end] == '@')
922 end += 1;
923 parse(m_string.left(m_userEnd) + p + m_string.substring(end));
924 } else {
925 // Remove '@' if we now have neither user nor password.
926 if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
927 end += 1;
928 // We don't want to parse in the extremely common case where we are not going to make a change.
929 if (m_userEnd != end)
930 parse(m_string.left(m_userEnd) + m_string.substring(end));
931 }
932}
933
934void URL::setFragmentIdentifier(const String& s)
935{
936 if (!m_isValid)
937 return;
938
939 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
940 parse(m_string.left(m_queryEnd) + "#" + s);
941}
942
943void URL::removeFragmentIdentifier()
944{
945 if (!m_isValid)
946 return;
947 parse(m_string.left(m_queryEnd));
948}
949
950void URL::setQuery(const String& query)
951{
952 if (!m_isValid)
953 return;
954
955 // FIXME: '#' and non-ASCII characters must be encoded and escaped.
956 // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
957 // access to the document in this function.
958 if ((query.isEmpty() || query[0] != '?') && !query.isNull())
959 parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
960 else
961 parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
962
963}
964
965void URL::setPath(const String& s)
966{
967 if (!m_isValid)
968 return;
969
970 // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
971 // may be inadvertently affected.
972 String path = s;
973 if (path.isEmpty() || path[0] != '/')
974 path = "/" + path;
975
976 parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
977}
978
979String decodeURLEscapeSequences(const String& string)
980{
981 return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
982}
983
984String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
985{
986 return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
987}
988
989// Caution: This function does not bounds check.
990static void appendEscapedChar(char*& buffer, unsigned char c)
991{
992 *buffer++ = '%';
993 placeByteAsHex(c, buffer);
994}
995
996static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
997{
998 char* p = buffer;
999
1000 const char* str = strStart;
1001 const char* strEnd = strStart + length;
1002 while (str < strEnd) {
1003 unsigned char c = *str++;
1004 if (isBadChar(c)) {
1005 if (c == '%' || c == '?')
1006 *p++ = c;
1007 else if (c != 0x09 && c != 0x0a && c != 0x0d)
1008 appendEscapedChar(p, c);
1009 } else
1010 *p++ = c;
1011 }
1012
1013 buffer = p;
1014}
1015
1016static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
1017{
1018 char* p = buffer;
1019
1020 const char* str = strStart;
1021 const char* strEnd = strStart + length;
1022 while (str < strEnd) {
1023 unsigned char c = *str++;
1024 // Strip CR, LF and Tab from fragments, per:
1025 // https://bugs.webkit.org/show_bug.cgi?id=8770
1026 if (c == 0x09 || c == 0x0a || c == 0x0d)
1027 continue;
1028
1029 // Chrome and IE allow non-ascii characters in fragments, however doing
1030 // so would hit an ASSERT in checkEncodedString, so for now we don't.
1031 if (c < 0x20 || c >= 127) {
1032 appendEscapedChar(p, c);
1033 continue;
1034 }
1035 *p++ = c;
1036 }
1037
1038 buffer = p;
1039}
1040
1041// copy a path, accounting for "." and ".." segments
1042static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
1043{
1044 char* bufferPathStart = dst;
1045
1046 // empty path is a special case, and need not have a leading slash
1047 if (srcStart != srcEnd) {
1048 const char* baseStringStart = src + srcStart;
1049 const char* baseStringEnd = src + srcEnd;
1050 const char* baseStringPos = baseStringStart;
1051
1052 // this code is unprepared for paths that do not begin with a
1053 // slash and we should always have one in the source string
1054 ASSERT(baseStringPos[0] == '/');
1055
1056 // copy the leading slash into the destination
1057 *dst = *baseStringPos;
1058 baseStringPos++;
1059 dst++;
1060
1061 while (baseStringPos < baseStringEnd) {
1062 if (baseStringPos[0] == '.' && dst[-1] == '/') {
1063 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
1064 // skip over "." segment
1065 baseStringPos += 2;
1066 continue;
1067 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
1068 baseStringPos + 2 == baseStringEnd)) {
1069 // skip over ".." segment and rewind the last segment
1070 // the RFC leaves it up to the app to decide what to do with excess
1071 // ".." segments - we choose to drop them since some web content
1072 // relies on this.
1073 baseStringPos += 3;
1074 if (dst > bufferPathStart + 1)
1075 dst--;
1076 while (dst > bufferPathStart && dst[-1] != '/')
1077 dst--;
1078 continue;
1079 }
1080 }
1081
1082 *dst = *baseStringPos;
1083 baseStringPos++;
1084 dst++;
1085 }
1086 }
1087 *dst = '\0';
1088 return dst - bufferPathStart;
1089}
1090
// Returns true when the NUL-terminated string contains a '.' that is
// immediately preceded by '/' or by another '.'. The first character is never
// tested because it has no predecessor.
static inline bool hasSlashDotOrDotDot(const char* str)
{
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str);

    unsigned char previous = bytes[0];
    if (!previous)
        return false;

    for (size_t i = 1; bytes[i]; ++i) {
        const unsigned char current = bytes[i];
        if (current == '.' && (previous == '/' || previous == '.'))
            return true;
        previous = current;
    }
    return false;
}
1104
1105void URL::parse(const String& string)
1106{
1107 checkEncodedString(string);
1108
1109 CharBuffer buffer(string.length() + 1);
1110 copyASCII(string, buffer.data());
1111 buffer[string.length()] = '\0';
1112 parse(buffer.data(), &string);
1113}
1114
1115#if PLATFORM(IOS)
1116static bool shouldCanonicalizeScheme = true;
1117
1118void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization)
1119{
1120 shouldCanonicalizeScheme = enableSchemeCanonicalization;
1121}
1122#endif
1123
1124template<size_t length>
1125static inline bool equal(const char* a, const char (&b)[length])
1126{
1127#if PLATFORM(IOS)
1128 if (!shouldCanonicalizeScheme) {
1129 for (size_t i = 0; i < length; ++i) {
1130 if (toASCIILower(a[i]) != b[i])
1131 return false;
1132 }
1133 return true;
1134 }
1135#endif
1136 for (size_t i = 0; i < length; ++i) {
1137 if (a[i] != b[i])
1138 return false;
1139 }
1140 return true;
1141}
1142
// Length-checked variant: true only when lengthA equals the array size of
// stringB and the characters compare equal.
template<size_t lengthB>
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
{
    if (lengthA != lengthB)
        return false;
    return equal(stringA, stringB);
}
1148
1149// List of default schemes is taken from google-url:
1150// http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
1151static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
1152{
1153 // This switch is theoretically a performance optimization. It came over when
1154 // the code was moved from google-url, but may be removed later.
1155 switch (schemeLength) {
1156 case 2:
1157 return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
1158 case 3:
1159 if (equal(scheme, ftpScheme))
1160 return equal(port, portLength, ftpPort);
1161 if (equal(scheme, wssScheme))
1162 return equal(port, portLength, httpsPort);
1163 break;
1164 case 4:
1165 return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
1166 case 5:
1167 return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
1168 case 6:
1169 return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
1170 }
1171 return false;
1172}
1173
// True when the character ending the userinfo is '@' while the host/port
// range is empty (hostStart equals portEnd).
static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar)
{
    const bool hostAndPortAreEmpty = hostStart == portEnd;
    const bool credentialsPresent = userinfoEndChar == '@';
    return credentialsPresent && hostAndPortAreEmpty;
}
1178
1179static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
1180{
1181 switch (schemeLength) {
1182 case 2:
1183 return equal(scheme, wsScheme);
1184 case 3:
1185 return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1186 case 4:
1187 return equal(scheme, httpScheme);
1188 case 5:
1189 return equal(scheme, httpsScheme);
1190 case 6:
1191 return equal(scheme, gopherScheme);
1192 }
1193 return false;
1194}
1195
1196static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
1197{
1198 switch (schemeLength) {
1199 case 2:
1200 return equal(scheme, wsScheme);
1201 case 3:
1202 return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1203 case 4:
1204 return equal(scheme, httpScheme) || equal(scheme, fileScheme);
1205 case 5:
1206 return equal(scheme, httpsScheme);
1207 case 6:
1208 return equal(scheme, gopherScheme);
1209 }
1210 return false;
1211}
1212
// Parses the NUL-terminated string `url`, writes a canonicalized form into m_string and
// records the component offsets (m_schemeEnd, m_userStart, m_userEnd, m_passwordEnd,
// m_hostEnd, m_portEnd, m_pathEnd, m_pathAfterLastSlash, m_queryEnd, m_fragmentEnd).
// On success m_isValid is set to true. On any failure the input text is stored in
// m_string unchanged and invalidate() is called.
// `originalString`, when non-null, is the text stored on failure, and is reused as
// m_string on success if the canonical form turns out to be identical to it.
1213 void URL::parse(const char* url, const String* originalString)
1214 {
1215 if (!url || url[0] == '\0') {
1216 // valid URL must be non-empty
1217 m_string = originalString ? *originalString : url;
1218 invalidate();
1219 return;
1220 }
1221
1222 if (!isSchemeFirstChar(url[0])) {
1223 // scheme must start with an alphabetic character
1224 m_string = originalString ? *originalString : url;
1225 invalidate();
1226 return;
1227 }
1228
// Scan the scheme characters; the scheme must be terminated by ':'.
1229 int schemeEnd = 0;
1230 while (isSchemeChar(url[schemeEnd]))
1231 schemeEnd++;
1232
1233 if (url[schemeEnd] != ':') {
1234 m_string = originalString ? *originalString : url;
1235 invalidate();
1236 return;
1237 }
1238
// The following locals are indices into the input `url`, not into the canonical
// output buffer built further below.
1239 int userStart = schemeEnd + 1;
1240 int userEnd;
1241 int passwordStart;
1242 int passwordEnd;
1243 int hostStart;
1244 int hostEnd;
1245 int portStart;
1246 int portEnd;
1247
1248 bool hierarchical = url[schemeEnd + 1] == '/';
1249 bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
1250
1251 bool isFile = schemeEnd == 4
1252 && isLetterMatchIgnoringCase(url[0], 'f')
1253 && isLetterMatchIgnoringCase(url[1], 'i')
1254 && isLetterMatchIgnoringCase(url[2], 'l')
1255 && isLetterMatchIgnoringCase(url[3], 'e');
1256
// True for an input that starts with "http:" or "https:", ignoring letter case.
1257 m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
1258 && isLetterMatchIgnoringCase(url[1], 't')
1259 && isLetterMatchIgnoringCase(url[2], 't')
1260 && isLetterMatchIgnoringCase(url[3], 'p')
1261 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1262
1263 if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
1264 // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
1265 // Attempt to find an authority.
1266 // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
1267
1268 if (hierarchical) {
1269 userStart++;
1270 if (hasSecondSlash) {
1271 userStart++;
1272 if (isNonFileHierarchicalScheme(url, schemeEnd)) {
1273 while (url[userStart] == '/')
1274 userStart++;
1275 }
1276 }
1277 }
1278
1279 userEnd = userStart;
1280
// Scan the would-be userinfo, remembering the first ':' (colonPos stays 0 when none is seen).
1281 int colonPos = 0;
1282 while (isUserInfoChar(url[userEnd])) {
1283 if (url[userEnd] == ':' && colonPos == 0)
1284 colonPos = userEnd;
1285 userEnd++;
1286 }
1287
1288 if (url[userEnd] == '@') {
1289 // actual end of the userinfo, start on the host
1290 if (colonPos != 0) {
1291 passwordEnd = userEnd;
1292 userEnd = colonPos;
1293 passwordStart = colonPos + 1;
1294 } else
1295 passwordStart = passwordEnd = userEnd;
1296
1297 hostStart = passwordEnd + 1;
1298 } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
1299 // hit the end of the authority, must have been no user
1300 // or looks like an IPv6 hostname
1301 // either way, try to parse it as a hostname
1302 userEnd = userStart;
1303 passwordStart = passwordEnd = userEnd;
1304 hostStart = userStart;
1305 } else {
1306 // invalid character
1307 m_string = originalString ? *originalString : url;
1308 invalidate();
1309 return;
1310 }
1311
1312 hostEnd = hostStart;
1313
1314 // IPV6 IP address
1315 if (url[hostEnd] == '[') {
1316 hostEnd++;
1317 while (isIPv6Char(url[hostEnd]))
1318 hostEnd++;
1319 if (url[hostEnd] == ']')
1320 hostEnd++;
1321 else {
1322 // invalid character
1323 m_string = originalString ? *originalString : url;
1324 invalidate();
1325 return;
1326 }
1327 } else {
1328 while (isHostnameChar(url[hostEnd]))
1329 hostEnd++;
1330 }
1331
1332 if (url[hostEnd] == ':') {
1333 portStart = portEnd = hostEnd + 1;
1334
1335 // possible start of port
1336 portEnd = portStart;
1337 while (isASCIIDigit(url[portEnd]))
1338 portEnd++;
1339 } else
1340 portStart = portEnd = hostEnd;
1341
1342 if (!isPathSegmentEndChar(url[portEnd])) {
1343 // invalid character
1344 m_string = originalString ? *originalString : url;
1345 invalidate();
1346 return;
1347 }
1348
1349 if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) {
1350 m_string = originalString ? *originalString : url;
1351 invalidate();
1352 return;
1353 }
1354
1355 if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
1356 // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
1357 // path segments are empty. For file, http and https only, an empty authority is allowed.
1358 userStart -= 2;
1359 userEnd = userStart;
1360 passwordStart = userEnd;
1361 passwordEnd = passwordStart;
1362 hostStart = passwordEnd;
1363 hostEnd = hostStart;
1364 portStart = hostEnd;
1365 portEnd = hostEnd;
1366 }
1367 } else {
1368 // the part after the scheme must be an opaque_part or an abs_path
1369 userEnd = userStart;
1370 passwordStart = passwordEnd = userEnd;
1371 hostStart = hostEnd = passwordEnd;
1372 portStart = portEnd = hostEnd;
1373 }
1374
// Locate the path, query and fragment boundaries in the input.
1375 int pathStart = portEnd;
1376 int pathEnd = pathStart;
1377 while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
1378 pathEnd++;
1379
1380 int queryStart = pathEnd;
1381 int queryEnd = queryStart;
1382 if (url[queryStart] == '?') {
1383 while (url[queryEnd] && url[queryEnd] != '#')
1384 queryEnd++;
1385 }
1386
1387 int fragmentStart = queryEnd;
1388 int fragmentEnd = fragmentStart;
1389 if (url[fragmentStart] == '#') {
1390 fragmentStart++;
1391 fragmentEnd = fragmentStart;
1392 while (url[fragmentEnd])
1393 fragmentEnd++;
1394 }
1395
1396 // assemble it all, remembering the real ranges
1397
// NOTE(review): the factor of three presumably allows every input character to
// expand to a three-character escape - confirm against appendEscapedChar().
1398 Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
1399
1400 char *p = buffer.data();
1401 const char *strPtr = url;
1402
1403 // copy in the scheme
1404 const char *schemeEndPtr = url + schemeEnd;
1405 #if PLATFORM(IOS)
1406 if (shouldCanonicalizeScheme || m_protocolIsInHTTPFamily) {
1407 while (strPtr < schemeEndPtr)
1408 *p++ = toASCIILower(*strPtr++);
1409 } else {
1410 while (strPtr < schemeEndPtr)
1411 *p++ = *strPtr++;
1412 }
1413 #else
1414 while (strPtr < schemeEndPtr)
1415 *p++ = toASCIILower(*strPtr++);
1416 #endif
1417 m_schemeEnd = p - buffer.data();
1418
// True when the nine characters from userStart to portEnd spell "localhost",
// ignoring letter case.
1419 bool hostIsLocalHost = portEnd - userStart == 9
1420 && isLetterMatchIgnoringCase(url[userStart], 'l')
1421 && isLetterMatchIgnoringCase(url[userStart+1], 'o')
1422 && isLetterMatchIgnoringCase(url[userStart+2], 'c')
1423 && isLetterMatchIgnoringCase(url[userStart+3], 'a')
1424 && isLetterMatchIgnoringCase(url[userStart+4], 'l')
1425 && isLetterMatchIgnoringCase(url[userStart+5], 'h')
1426 && isLetterMatchIgnoringCase(url[userStart+6], 'o')
1427 && isLetterMatchIgnoringCase(url[userStart+7], 's')
1428 && isLetterMatchIgnoringCase(url[userStart+8], 't');
1429
1430 // File URLs need a host part unless it is just file:// or file://localhost
1431 bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
1432
1433 // We drop empty credentials, but keep a colon in an empty host/port pair.
1434 // Removing hostname completely would change the structure of the URL on re-parsing.
1435 bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd;
1436
1437 // add ":" after scheme
1438 *p++ = ':';
1439
1440 // if we have at least one authority part or a file URL - add "//" and authority
1441 if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
1442 *p++ = '/';
1443 *p++ = '/';
1444
1445 m_userStart = p - buffer.data();
1446
1447 // copy in the user
1448 strPtr = url + userStart;
1449 const char* userEndPtr = url + userEnd;
1450 while (strPtr < userEndPtr) {
1451 char c = *strPtr++;
1452 ASSERT(isUserInfoChar(c));
1453 *p++ = c;
1454 }
1455 m_userEnd = p - buffer.data();
1456
1457 // copy in the password
1458 if (passwordEnd != passwordStart) {
1459 *p++ = ':';
1460 strPtr = url + passwordStart;
1461 const char* passwordEndPtr = url + passwordEnd;
1462 while (strPtr < passwordEndPtr) {
1463 char c = *strPtr++;
1464 ASSERT(isUserInfoChar(c));
1465 *p++ = c;
1466 }
1467 }
1468 m_passwordEnd = p - buffer.data();
1469
1470 // If we had any user info, add "@"
1471 if (p - buffer.data() != m_userStart)
1472 *p++ = '@';
1473
1474 // copy in the host, except in the case of a file URL with authority="localhost"
1475 if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
1476 strPtr = url + hostStart;
1477 const char* hostEndPtr = url + hostEnd;
1478 if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
1479 while (strPtr < hostEndPtr) {
1480 char c = toASCIILower(*strPtr++);
1481 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1482 *p++ = c;
1483 }
1484 } else {
1485 while (strPtr < hostEndPtr) {
1486 char c = *strPtr++;
1487 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1488 *p++ = c;
1489 }
1490 }
1491 }
1492 m_hostEnd = p - buffer.data();
1493
1494 // Copy in the port if the URL has one (and it's not default). Also, copy it if there was no hostname, so that there is still something in authority component.
1495 if (hostEnd != portStart) {
1496 const char* portStr = url + portStart;
1497 size_t portLength = portEnd - portStart;
1498 if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd))
1499 || (hostStart == hostEnd && hostEnd != portStart)) {
1500 *p++ = ':';
1501 const char* portEndPtr = url + portEnd;
1502 while (portStr < portEndPtr)
1503 *p++ = *portStr++;
1504 }
1505 }
1506 m_portEnd = p - buffer.data();
1507 } else {
1508 if (isFile) {
1509 ASSERT(degenerateFilePath);
1510 *p++ = '/';
1511 *p++ = '/';
1512 }
1513 m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
1514 }
1515
1516 // For canonicalization, ensure we have a '/' for no path.
1517 // Do this only for URL with protocol file, http or https.
1518 if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
1519 *p++ = '/';
1520
1521 // add path, escaping bad characters
1522 if (!hierarchical)
1523 escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
1524 else if (!hasSlashDotOrDotDot(url))
1525 appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
1526 else {
1527 CharBuffer pathBuffer(pathEnd - pathStart + 1);
1528 size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
1529 appendEscapingBadChars(p, pathBuffer.data(), length);
1530 }
1531
1532 m_pathEnd = p - buffer.data();
1533
1534 // Find the position after the last slash in the path, or
1535 // the position before the path if there are no slashes in it.
1536 int i;
1537 for (i = m_pathEnd; i > m_portEnd; --i) {
1538 if (buffer[i - 1] == '/')
1539 break;
1540 }
1541 m_pathAfterLastSlash = i;
1542
1543 // add query, escaping bad characters
1544 appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
1545 m_queryEnd = p - buffer.data();
1546
1547 // add fragment, escaping bad characters
1548 if (fragmentEnd != queryEnd) {
1549 *p++ = '#';
1550 escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
1551 }
1552 m_fragmentEnd = p - buffer.data();
1553
1554 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1555 ASSERT(buffer.size() > 0);
1556
1557 // If we didn't end up actually changing the original string and
1558 // it was already in a String, reuse it to avoid extra allocation.
1559 if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd))
1560 m_string = *originalString;
1561 else
1562 m_string = String(buffer.data(), m_fragmentEnd);
1563
1564 m_isValid = true;
1565 }
1566
1567bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
1568{
1569 if (a.m_queryEnd != b.m_queryEnd)
1570 return false;
1571 unsigned queryLength = a.m_queryEnd;
1572 for (unsigned i = 0; i < queryLength; ++i)
1573 if (a.string()[i] != b.string()[i])
1574 return false;
1575 return true;
1576}
1577
1578bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
1579{
1580 if (a.m_schemeEnd != b.m_schemeEnd)
1581 return false;
1582
1583 int hostStartA = a.hostStart();
1584 int hostLengthA = a.hostEnd() - hostStartA;
1585 int hostStartB = b.hostStart();
1586 int hostLengthB = b.hostEnd() - b.hostStart();
1587 if (hostLengthA != hostLengthB)
1588 return false;
1589
1590 // Check the scheme
1591 for (int i = 0; i < a.m_schemeEnd; ++i)
1592 if (a.string()[i] != b.string()[i])
1593 return false;
1594
1595 // And the host
1596 for (int i = 0; i < hostLengthA; ++i)
1597 if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1598 return false;
1599
1600 if (a.port() != b.port())
1601 return false;
1602
1603 return true;
1604}
1605
1606bool hostsAreEqual(const URL& a, const URL& b)
1607{
1608 int hostStartA = a.hostStart();
1609 int hostLengthA = a.hostEnd() - hostStartA;
1610 int hostStartB = b.hostStart();
1611 int hostLengthB = b.hostEnd() - hostStartB;
1612 if (hostLengthA != hostLengthB)
1613 return false;
1614
1615 for (int i = 0; i < hostLengthA; ++i) {
1616 if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1617 return false;
1618 }
1619
1620 return true;
1621}
1622
1623String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode)
1624{
1625 CString asUTF8 = notEncodedString.utf8();
1626
1627 CharBuffer buffer(asUTF8.length() * 3 + 1);
1628 char* p = buffer.data();
1629
1630 const char* str = asUTF8.data();
1631 const char* strEnd = str + asUTF8.length();
1632 while (str < strEnd) {
1633 unsigned char c = *str++;
1634 if (percentEncodeClassTable[c] >= whatToEncode)
1635 appendEscapedChar(p, c);
1636 else
1637 *p++ = c;
1638 }
1639
1640 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1641
1642 return String(buffer.data(), p - buffer.data());
1643}
1644
1645String encodeWithURLEscapeSequences(const String& notEncodedString)
1646{
1647 CString asUTF8 = notEncodedString.utf8();
1648
1649 CharBuffer buffer(asUTF8.length() * 3 + 1);
1650 char* p = buffer.data();
1651
1652 const char* str = asUTF8.data();
1653 const char* strEnd = str + asUTF8.length();
1654 while (str < strEnd) {
1655 unsigned char c = *str++;
1656 if (isBadChar(c))
1657 appendEscapedChar(p, c);
1658 else
1659 *p++ = c;
1660 }
1661
1662 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1663
1664 return String(buffer.data(), p - buffer.data());
1665}
1666
1667static bool containsOnlyASCII(StringView string)
1668{
1669 if (string.is8Bit())
1670 return charactersAreAllASCII(string.characters8(), string.length());
1671 return charactersAreAllASCII(string.characters16(), string.length());
1672}
1673
1674static bool protocolIs(StringView stringURL, const char* protocol)
1675{
1676 assertProtocolIsGood(protocol);
1677 unsigned length = stringURL.length();
1678 for (unsigned i = 0; i < length; ++i) {
1679 if (!protocol[i])
1680 return stringURL[i] == ':';
1681 if (!isLetterMatchIgnoringCase(stringURL[i], protocol[i]))
1682 return false;
1683 }
1684 return false;
1685}
1686
1687// Appends the punycoded hostname identified by the given string and length to
1688// the output buffer. The result will not be null terminated.
1689// Return value of false means error in encoding.
1690static bool appendEncodedHostname(UCharBuffer& buffer, StringView string)
1691{
1692 // Needs to be big enough to hold an IDN-encoded name.
1693 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
1694 const unsigned hostnameBufferLength = 2048;
1695
1696 if (string.length() > hostnameBufferLength || containsOnlyASCII(string)) {
1697 append(buffer, string);
1698 return true;
1699 }
1700
1701 UChar hostnameBuffer[hostnameBufferLength];
1702 UErrorCode error = U_ZERO_ERROR;
1703
1704#if COMPILER(GCC_OR_CLANG)
1705#pragma GCC diagnostic push
1706#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
1707#endif
1708 int32_t numCharactersConverted = uidna_IDNToASCII(string.upconvertedCharacters(), string.length(), hostnameBuffer,
1709 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
1710#if COMPILER(GCC_OR_CLANG)
1711#pragma GCC diagnostic pop
1712#endif
1713
1714 if (error == U_ZERO_ERROR) {
1715 buffer.append(hostnameBuffer, numCharactersConverted);
1716 return true;
1717 }
1718 return false;
1719}
1720
1721static void findHostnamesInMailToURL(StringView string, Vector<std::pair<int, int>>& nameRanges)
1722{
1723 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
1724 // Skip quoted strings so that characters in them don't confuse us.
1725 // When we find a '?' character, we are past the part of the URL that contains host names.
1726
1727 nameRanges.clear();
1728
1729 int p = 0;
1730 while (1) {
1731 // Find start of host name or of quoted string.
1732 int hostnameOrStringStart = findFirstOf(string, p, "\"@?");
1733 if (hostnameOrStringStart == -1)
1734 return;
1735 UChar c = string[hostnameOrStringStart];
1736 p = hostnameOrStringStart + 1;
1737
1738 if (c == '?')
1739 return;
1740
1741 if (c == '@') {
1742 // Find end of host name.
1743 int hostnameStart = p;
1744 int hostnameEnd = findFirstOf(string, p, ">,?");
1745 bool done;
1746 if (hostnameEnd == -1) {
1747 hostnameEnd = string.length();
1748 done = true;
1749 } else {
1750 p = hostnameEnd;
1751 done = false;
1752 }
1753
1754 nameRanges.append(std::make_pair(hostnameStart, hostnameEnd));
1755
1756 if (done)
1757 return;
1758 } else {
1759 // Skip quoted string.
1760 ASSERT(c == '"');
1761 while (1) {
1762 int escapedCharacterOrStringEnd = findFirstOf(string, p, "\"\\");
1763 if (escapedCharacterOrStringEnd == -1)
1764 return;
1765
1766 c = string[escapedCharacterOrStringEnd];
1767 p = escapedCharacterOrStringEnd + 1;
1768
1769 // If we are the end of the string, then break from the string loop back to the host name loop.
1770 if (c == '"')
1771 break;
1772
1773 // Skip escaped character.
1774 ASSERT(c == '\\');
1775 if (p == static_cast<int>(string.length()))
1776 return;
1777
1778 ++p;
1779 }
1780 }
1781 }
1782}
1783
1784static bool findHostnameInHierarchicalURL(StringView string, int& startOffset, int& endOffset)
1785{
1786 // Find the host name in a hierarchical URL.
1787 // It comes after a "://" sequence, with scheme characters preceding, and
1788 // this should be the first colon in the string.
1789 // It ends with the end of the string or a ":" or a path segment ending character.
1790 // If there is a "@" character, the host part is just the part after the "@".
1791 int separator = findFirstOf(string, 0, ":");
1792 if (separator == -1 || separator + 2 >= static_cast<int>(string.length()) || string[separator + 1] != '/' || string[separator + 2] != '/')
1793 return false;
1794
1795 // Check that all characters before the :// are valid scheme characters.
1796 if (!isSchemeFirstChar(string[0]))
1797 return false;
1798 for (int i = 1; i < separator; ++i) {
1799 if (!isSchemeChar(string[i]))
1800 return false;
1801 }
1802
1803 // Start after the separator.
1804 int authorityStart = separator + 3;
1805
1806 // Find terminating character.
1807 int hostnameEnd = string.length();
1808 for (int i = authorityStart; i < hostnameEnd; ++i) {
1809 UChar c = string[i];
1810 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
1811 hostnameEnd = i;
1812 break;
1813 }
1814 }
1815
1816 // Find "@" for the start of the host name.
1817 int userInfoTerminator = findFirstOf(string, authorityStart, "@");
1818 int hostnameStart;
1819 if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
1820 hostnameStart = authorityStart;
1821 else
1822 hostnameStart = userInfoTerminator + 1;
1823
1824 startOffset = hostnameStart;
1825 endOffset = hostnameEnd;
1826 return true;
1827}
1828
1829// Converts all hostnames found in the given input to punycode, preserving the
1830// rest of the URL unchanged. The output will NOT be null-terminated.
1831// Return value of false means error in encoding.
1832static bool encodeHostnames(StringView string, UCharBuffer& buffer)
1833{
1834 buffer.clear();
1835
1836 if (protocolIs(string, "mailto")) {
1837 Vector<std::pair<int, int>> hostnameRanges;
1838 findHostnamesInMailToURL(string, hostnameRanges);
1839 int n = hostnameRanges.size();
1840 int p = 0;
1841 for (int i = 0; i < n; ++i) {
1842 const std::pair<int, int>& r = hostnameRanges[i];
1843 append(buffer, string.substring(p, r.first - p));
1844 if (!appendEncodedHostname(buffer, string.substring(r.first, r.second - r.first)))
1845 return false;
1846 p = r.second;
1847 }
1848 // This will copy either everything after the last hostname, or the
1849 // whole thing if there is no hostname.
1850 append(buffer, string.substring(p));
1851 } else {
1852 int hostStart, hostEnd;
1853 if (findHostnameInHierarchicalURL(string, hostStart, hostEnd)) {
1854 append(buffer, string.substring(0, hostStart)); // Before hostname.
1855 if (!appendEncodedHostname(buffer, string.substring(hostStart, hostEnd - hostStart)))
1856 return false;
1857 append(buffer, string.substring(hostEnd)); // After hostname.
1858 } else {
1859 // No hostname to encode, return the input.
1860 append(buffer, string);
1861 }
1862 }
1863
1864 return true;
1865}
1866
1867// Return value of false means error in encoding.
1868static bool encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
1869{
1870 UCharBuffer s;
1871 if (!encodeHostnames(rel, s))
1872 return false;
1873
1874 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
1875
1876 int pathEnd = -1;
1877 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
1878 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
1879 pathEnd = findFirstOf(StringView(s.data(), s.size()), 0, "#?");
1880 }
1881
1882 if (pathEnd == -1) {
1883 CString decoded = pathEncoding.encode(StringView(s.data(), s.size()), URLEncodedEntitiesForUnencodables);
1884 output.resize(decoded.length());
1885 memcpy(output.data(), decoded.data(), decoded.length());
1886 } else {
1887 CString pathDecoded = pathEncoding.encode(StringView(s.data(), pathEnd), URLEncodedEntitiesForUnencodables);
1888 // Unencodable characters in URLs are represented by converting
1889 // them to XML entities and escaping non-alphanumeric characters.
1890 CString otherDecoded = encoding.encode(StringView(s.data() + pathEnd, s.size() - pathEnd), URLEncodedEntitiesForUnencodables);
1891
1892 output.resize(pathDecoded.length() + otherDecoded.length());
1893 memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
1894 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
1895 }
1896 output.append('\0'); // null-terminate the output.
1897
1898 return true;
1899}
1900
1901static String substituteBackslashes(const String& string)
1902{
1903 size_t questionPos = string.find('?');
1904 size_t hashPos = string.find('#');
1905 unsigned pathEnd;
1906
1907 if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
1908 pathEnd = hashPos;
1909 else if (questionPos != notFound)
1910 pathEnd = questionPos;
1911 else
1912 pathEnd = string.length();
1913
1914 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
1915}
1916
1917bool URL::isHierarchical() const
1918{
1919 if (!m_isValid)
1920 return false;
1921 ASSERT(m_string[m_schemeEnd] == ':');
1922 return m_string[m_schemeEnd + 1] == '/';
1923}
1924
1925void URL::copyToBuffer(Vector<char, 512>& buffer) const
1926{
1927 // FIXME: This throws away the high bytes of all the characters in the string!
1928 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
1929 buffer.resize(m_string.length());
1930 copyASCII(m_string, buffer.data());
1931}
1932
1933bool protocolIs(const String& url, const char* protocol)
1934{
1935 // Do the comparison without making a new string object.
1936 assertProtocolIsGood(protocol);
1937 bool isLeading = true;
1938 for (int i = 0, j = 0; url[i]; ++i) {
1939 // skip leading whitespace and control characters.
1940 if (isLeading && shouldTrimFromURL(url[i]))
1941 continue;
1942 isLeading = false;
1943
1944 // skip any tabs and newlines.
1945 if (isTabNewline(url[i]))
1946 continue;
1947
1948 if (!protocol[j])
1949 return url[i] == ':';
1950 if (!isLetterMatchIgnoringCase(url[i], protocol[j]))
1951 return false;
1952
1953 ++j;
1954 }
1955
1956 return false;
1957}
1958
1959bool isValidProtocol(const String& protocol)
1960{
1961 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
1962 if (protocol.isEmpty())
1963 return false;
1964 if (!isSchemeFirstChar(protocol[0]))
1965 return false;
1966 unsigned protocolLength = protocol.length();
1967 for (unsigned i = 1; i < protocolLength; i++) {
1968 if (!isSchemeChar(protocol[i]))
1969 return false;
1970 }
1971 return true;
1972}
1973
1974#ifndef NDEBUG
1975void URL::print() const
1976{
1977 printf("%s\n", m_string.utf8().data());
1978}
1979#endif
1980
1981String URL::strippedForUseAsReferrer() const
1982{
1983 URL referrer(*this);
1984 referrer.setUser(String());
1985 referrer.setPass(String());
1986 referrer.removeFragmentIdentifier();
1987 return referrer.string();
1988}
1989
1990bool URL::isLocalFile() const
1991{
1992 // Including feed here might be a bad idea since drag and drop uses this check
1993 // and including feed would allow feeds to potentially let someone's blog
1994 // read the contents of the clipboard on a drag, even without a drop.
1995 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
1996#if PLATFORM(QT)
1997 return protocolIs("file") || protocolIs("qrc");
1998#else
1999 return protocolIs("file");
2000#endif
2001}
2002
2003bool protocolIsJavaScript(const String& url)
2004{
2005 return protocolIs(url, "javascript");
2006}
2007
2008bool protocolIsInHTTPFamily(const String& url)
2009{
2010 // Do the comparison without making a new string object.
2011 return isLetterMatchIgnoringCase(url[0], 'h')
2012 && isLetterMatchIgnoringCase(url[1], 't')
2013 && isLetterMatchIgnoringCase(url[2], 't')
2014 && isLetterMatchIgnoringCase(url[3], 'p')
2015 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
2016}
2017
2018const URL& blankURL()
2019{
2020 static NeverDestroyed<URL> staticBlankURL(ParsedURLString, "about:blank");
2021 return staticBlankURL;
2022}
2023
2024bool URL::isBlankURL() const
2025{
2026 return protocolIs("about");
2027}
2028
2029typedef HashMap<String, unsigned short, ASCIICaseInsensitiveHash> DefaultPortsMap;
2030static const DefaultPortsMap& defaultPortsMap()
2031{
2032 static NeverDestroyed<const DefaultPortsMap> defaultPortsMap(DefaultPortsMap({
2033 { "http", 80 },
2034 { "https", 443 },
2035 { "ftp", 21 },
2036 { "ftps", 990 }
2037 }));
2038 return defaultPortsMap.get();
2039}
2040unsigned short defaultPortForProtocol(const String& protocol)
2041{
2042 return defaultPortsMap().get(protocol);
2043}
2044
2045bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
2046{
2047 if (protocol.isEmpty())
2048 return false;
2049
2050 return defaultPortForProtocol(protocol) == port;
2051}
2052
2053bool portAllowed(const URL& url)
2054{
2055 unsigned short port = url.port();
2056
2057 // Since most URLs don't have a port, return early for the "no port" case.
2058 if (!port)
2059 return true;
2060
2061 // This blocked port list matches the port blocking that Mozilla implements.
2062 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
2063 static const unsigned short blockedPortList[] = {
2064 1, // tcpmux
2065 7, // echo
2066 9, // discard
2067 11, // systat
2068 13, // daytime
2069 15, // netstat
2070 17, // qotd
2071 19, // chargen
2072 20, // FTP-data
2073 21, // FTP-control
2074 22, // SSH
2075 23, // telnet
2076 25, // SMTP
2077 37, // time
2078 42, // name
2079 43, // nicname
2080 53, // domain
2081 77, // priv-rjs
2082 79, // finger
2083 87, // ttylink
2084 95, // supdup
2085 101, // hostriame
2086 102, // iso-tsap
2087 103, // gppitnp
2088 104, // acr-nema
2089 109, // POP2
2090 110, // POP3
2091 111, // sunrpc
2092 113, // auth
2093 115, // SFTP
2094 117, // uucp-path
2095 119, // nntp
2096 123, // NTP
2097 135, // loc-srv / epmap
2098 139, // netbios
2099 143, // IMAP2
2100 179, // BGP
2101 389, // LDAP
2102 465, // SMTP+SSL
2103 512, // print / exec
2104 513, // login
2105 514, // shell
2106 515, // printer
2107 526, // tempo
2108 530, // courier
2109 531, // Chat
2110 532, // netnews
2111 540, // UUCP
2112 556, // remotefs
2113 563, // NNTP+SSL
2114 587, // ESMTP
2115 601, // syslog-conn
2116 636, // LDAP+SSL
2117 993, // IMAP+SSL
2118 995, // POP3+SSL
2119 2049, // NFS
2120 3659, // apple-sasl / PasswordServer [Apple addition]
2121 4045, // lockd
2122 4190, // ManageSieve [Apple addition]
2123 6000, // X11
2124 6665, // Alternate IRC [Apple addition]
2125 6666, // Alternate IRC [Apple addition]
2126 6667, // Standard IRC [Apple addition]
2127 6668, // Alternate IRC [Apple addition]
2128 6669, // Alternate IRC [Apple addition]
2129 invalidPortNumber, // Used to block all invalid port numbers
2130 };
2131 const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
2132
2133#ifndef NDEBUG
2134 // The port list must be sorted for binary_search to work.
2135 static bool checkedPortList = false;
2136 if (!checkedPortList) {
2137 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
2138 ASSERT(*p < *(p + 1));
2139 checkedPortList = true;
2140 }
2141#endif
2142
2143 // If the port is not in the blocked port list, allow it.
2144 if (!std::binary_search(blockedPortList, blockedPortListEnd, port))
2145 return true;
2146
2147 // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
2148 if ((port == 21 || port == 22) && url.protocolIs("ftp"))
2149 return true;
2150
2151 // Allow any port number in a file URL, since the port number is ignored.
2152 if (url.protocolIs("file"))
2153 return true;
2154
2155 return false;
2156}
2157
2158String mimeTypeFromDataURL(const String& url)
2159{
2160 ASSERT(protocolIs(url, "data"));
2161
2162 // FIXME: What's the right behavior when the URL has a comma first, but a semicolon later?
2163 // Currently this code will break at the semicolon in that case. Not sure that's correct.
2164 auto index = url.find(';', 5);
2165 if (index == notFound)
2166 index = url.find(',', 5);
2167 if (index == notFound) {
2168 // FIXME: There was an old comment here that made it sound like this should be returning text/plain.
2169 // But we have been returning empty string here for some time, so not changing its behavior at this time.
2170 return emptyString();
2171 }
2172 if (index == 5)
2173 return ASCIILiteral("text/plain");
2174 ASSERT(index >= 5);
2175 return url.substring(5, index - 5).convertToASCIILowercase();
2176}
2177
2178String mimeTypeFromURL(const URL& url)
2179{
2180 String decodedPath = decodeURLEscapeSequences(url.path());
2181 String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);
2182
2183 // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
2184 return MIMETypeRegistry::getMIMETypeForExtension(extension);
2185}
2186
2187bool URL::isSafeToSendToAnotherThread() const
2188{
2189 return m_string.isSafeToSendToAnotherThread();
2190}
2191
2192String URL::stringCenterEllipsizedToLength(unsigned length) const
2193{
2194 if (string().length() <= length)
2195 return string();
2196
2197 return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
2198}
2199
2200URL URL::fakeURLWithRelativePart(const String& relativePart)
2201{
2202 return URL(URL(), "webkit-fake-url://" + createCanonicalUUIDString() + '/' + relativePart);
2203}
2204
2205URL URL::fileURLWithFileSystemPath(const String& filePath)
2206{
2207 return URL(URL(), "file:///" + filePath);
2208}
2209
2210}
2211