1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4#ifndef EXTENSIONS_COMMON_URL_PATTERN_H_
5#define EXTENSIONS_COMMON_URL_PATTERN_H_
6
#include <functional>
#include <iosfwd>
#include <string>
#include <vector>

#include "base/optional.h"
#include "base/strings/string_piece.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
14
15class GURL;
16
17// A pattern that can be used to match URLs. A URLPattern is a very restricted
18// subset of URL syntax:
19//
20// <url-pattern> := <scheme>://<host><port><path> | '<all_urls>'
21// <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome' |
22// 'chrome-extension' | 'filesystem'
23// <host> := '*' | <IPv4 address> | [<IPv6 address>] |
24// '*.' <anychar except '/' and '*'>+
25// <port> := [':' ('*' | <port number between 0 and 65535>)]
26// <path> := '/' <any chars>
27//
28// * Host is not used when the scheme is 'file'.
29// * The path can have embedded '*' characters which act as glob wildcards.
30// * '<all_urls>' is a special pattern that matches any URL that contains a
31// valid scheme (as specified by valid_schemes_).
32// * The '*' scheme pattern excludes file URLs.
33//
34// Examples of valid patterns:
35// - http://*/*
36// - http://*/foo*
37// - https://*.google.com/foo*bar
38// - file://monkey*
39// - http://127.0.0.1/*
40// - http://[2607:f8b0:4005:805::200e]/*
41//
42// Examples of invalid patterns:
43// - http://* -- path not specified
44// - http://*foo/bar -- * not allowed as substring of host component
45// - http://foo.*.bar/baz -- * must be first component
46// - http:/bar -- scheme separator not found
47// - foo://* -- invalid scheme
48// - chrome:// -- we don't support chrome internal URLs
49class URLPattern {
50 public:
51 // A collection of scheme bitmasks for use with valid_schemes.
52 enum SchemeMasks {
53 SCHEME_NONE = 0,
54 SCHEME_HTTP = 1 << 0,
55 SCHEME_HTTPS = 1 << 1,
56 SCHEME_FILE = 1 << 2,
57 SCHEME_FTP = 1 << 3,
58 SCHEME_CHROMEUI = 1 << 4,
59 SCHEME_EXTENSION = 1 << 5,
60 SCHEME_FILESYSTEM = 1 << 6,
61 SCHEME_WS = 1 << 7,
62 SCHEME_WSS = 1 << 8,
63 SCHEME_DATA = 1 << 9,
64
65 // IMPORTANT!
66 // SCHEME_ALL will match every scheme, including chrome://, chrome-
67 // extension://, about:, etc. Because this has lots of security
68 // implications, third-party extensions should usually not be able to get
69 // access to URL patterns initialized this way. If there is a reason
70 // for violating this general rule, document why this it safe.
71 SCHEME_ALL = -1,
72 };
73
74 // Error codes returned from Parse().
75 enum class ParseResult {
76 kSuccess = 0,
77 kMissingSchemeSeparator,
78 kInvalidScheme,
79 kWrongSchemeSeparator,
80 kEmptyHost,
81 kInvalidHostWildcard,
82 kEmptyPath,
83 kInvalidPort,
84 kInvalidHost,
85 kNumParseResults,
86 };
87
88 // Types of URLPattern that Parse() considers valid.
89 enum ParseOptions {
90 DENY_WILDCARD_FOR_EFFECTIVE_TLD,
91 ALLOW_WILDCARD_FOR_EFFECTIVE_TLD,
92 };
93
94 // The <all_urls> string pattern.
95 static const char kAllUrlsPattern[];
96
97 // Returns true if the given |scheme| is considered valid for extensions.
98 static bool IsValidSchemeForExtensions(base::StringPiece scheme);
99
100 // Returns the mask for all schemes considered valid for extensions.
101 static int GetValidSchemeMaskForExtensions();
102
103 explicit URLPattern(int valid_schemes);
104
105 // Convenience to construct a URLPattern from a string. If the string is not
106 // known ahead of time, use Parse() instead, which returns success or failure.
107 URLPattern(int valid_schemes, base::StringPiece pattern);
108
109 URLPattern();
110 URLPattern(const URLPattern& other);
111 URLPattern(URLPattern&& other);
112 ~URLPattern();
113
114 URLPattern& operator=(const URLPattern& other);
115 URLPattern& operator=(URLPattern&& other);
116
117 bool operator<(const URLPattern& other) const;
118 bool operator>(const URLPattern& other) const;
119 bool operator==(const URLPattern& other) const;
120
121 // Initializes this instance by parsing the provided string. Returns
122 // URLPattern::ParseResult::kSuccess on success, or an error code otherwise.
123 // On failure, this instance will have some intermediate values and is in an
124 // invalid state. If you want to allow the match pattern to specify a wildcard
125 // for the effective TLD, specify in |parse_options|.
126 ParseResult Parse(base::StringPiece pattern_str);
127 ParseResult Parse(base::StringPiece pattern_str, ParseOptions parse_options);
128
129 // Gets the bitmask of valid schemes.
130 int valid_schemes() const { return valid_schemes_; }
131 void SetValidSchemes(int valid_schemes);
132
133 // Gets the host the pattern matches. This can be an empty string if the
134 // pattern matches all hosts (the input was <scheme>://*/<whatever>).
135 const std::string& host() const { return host_; }
136 void SetHost(base::StringPiece host);
137
138 // Gets whether to match subdomains of host().
139 bool match_subdomains() const { return match_subdomains_; }
140 void SetMatchSubdomains(bool val);
141
142 // Gets whether host() contains an effective TLD. If false, during
143 // a match, the URL you're comparing must have its TLD removed
144 // prior to comparison.
145 // e.g. For the match pattern https://google.com/*
146 // If this is true: host() would be google.com
147 // If this is false: host() would be google
148 bool match_effective_tld() const { return match_effective_tld_; }
149 void SetMatchEffectiveTld(bool val);
150
151 // Gets the path the pattern matches with the leading slash. This can have
152 // embedded asterisks which are interpreted using glob rules.
153 const std::string& path() const { return path_; }
154 void SetPath(base::StringPiece path);
155
156 // Returns true if this pattern matches all urls.
157 bool match_all_urls() const { return match_all_urls_; }
158 void SetMatchAllURLs(bool val);
159
160 // Sets the scheme for pattern matches. This can be a single '*' if the
161 // pattern matches all valid schemes (as defined by the valid_schemes_
162 // property). Returns false on failure (if the scheme is not valid).
163 bool SetScheme(base::StringPiece scheme);
164 // Note: You should use MatchesScheme() instead of this getter unless you
165 // absolutely need the exact scheme. This is exposed for testing.
166 const std::string& scheme() const { return scheme_; }
167
168 // Returns true if the specified scheme can be used in this URL pattern, and
169 // false otherwise. Uses valid_schemes_ to determine validity.
170 bool IsValidScheme(base::StringPiece scheme) const;
171
172 // Returns true if this instance matches the specified URL.
173 bool MatchesURL(const GURL& test) const;
174
175 // Returns true if this instance matches the specified security origin.
176 bool MatchesSecurityOrigin(const GURL& test) const;
177
178 // Returns true if |test| matches our scheme.
179 // Note that if test is "filesystem", this may fail whereas MatchesURL
180 // may succeed. MatchesURL is smart enough to look at the inner_url instead
181 // of the outer "filesystem:" part.
182 bool MatchesScheme(base::StringPiece test) const;
183
184 // Returns true if |test| matches our host.
185 bool MatchesHost(base::StringPiece test) const;
186 bool MatchesHost(const GURL& test) const;
187
188 // Returns true if |test| matches our path.
189 bool MatchesPath(base::StringPiece test) const;
190
191 // Returns true if the pattern matches all patterns in an (e)TLD. This
192 // includes patterns like *://*.com/*, *://*.co.uk/*, etc. A pattern that
193 // matches all domains (e.g., *://*/*) will return true.
194 // |private_filter| specifies whether private registries (like appspot.com)
195 // should be considered; if included, patterns like *://*.appspot.com/* will
196 // return true. By default, we exclude private registries (so *.appspot.com
197 // returns false).
198 // Note: This is an expensive method, and should be used sparingly!
199 // You should probably use URLPatternSet::ShouldWarnAllHosts(), which is
200 // cached.
201 bool MatchesEffectiveTld(
202 net::registry_controlled_domains::PrivateRegistryFilter private_filter =
203 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES,
204 net::registry_controlled_domains::UnknownRegistryFilter unknown_filter =
205 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES) const;
206
207 // Returns true if the pattern only matches a single origin. The pattern may
208 // include a path.
209 bool MatchesSingleOrigin() const;
210
211 // Sets the port. Returns false if the port is invalid.
212 bool SetPort(base::StringPiece port);
213 const std::string& port() const { return port_; }
214
215 // Returns a string representing this instance.
216 const std::string& GetAsString() const;
217
218 // Determines whether there is a URL that would match this instance and
219 // another instance. This method is symmetrical: Calling
220 // other.OverlapsWith(this) would result in the same answer.
221 bool OverlapsWith(const URLPattern& other) const;
222
223 // Returns true if this pattern matches all possible URLs that |other| can
224 // match. For example, http://*.google.com encompasses http://www.google.com.
225 bool Contains(const URLPattern& other) const;
226
227 // Creates a new URLPattern that represents the intersection of this
228 // URLPattern with the |other|, or base::nullopt if no intersection exists.
229 // For instance, given the patterns http://*.google.com/* and
230 // *://maps.google.com/*, the intersection is http://maps.google.com/*.
231 // NOTES:
232 // - This will DCHECK if either pattern has match_effective_tld_ set to false.
233 // - Though scheme intersections are supported, the serialization of
234 // URLPatternSet does not record them. Be sure that this is safe for your
235 // use cases.
236 // - Path intersection is done on a best-effort basis. If one path clearly
237 // contains another, it will be handled correctly, but this method does not
238 // deal with cases like /*a* and /*b* (where technically the intersection
239 // is /*a*b*|/*b*a*); the intersection returned for that case will be empty.
240 base::Optional<URLPattern> CreateIntersection(const URLPattern& other) const;
241
242 // Converts this URLPattern into an equivalent set of URLPatterns that don't
243 // use a wildcard in the scheme component. If this URLPattern doesn't use a
244 // wildcard scheme, then the returned set will contain one element that is
245 // equivalent to this instance.
246 std::vector<URLPattern> ConvertToExplicitSchemes() const;
247
248 static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
249 if (a.match_all_urls_ && b.match_all_urls_)
250 return false;
251 return a.host_.compare(b.host_) < 0;
252 }
253
254 // Used for origin comparisons in a std::set.
255 class EffectiveHostCompareFunctor {
256 public:
257 bool operator()(const URLPattern& a, const URLPattern& b) const {
258 return EffectiveHostCompare(a, b);
259 }
260 };
261
262 // Get an error string for a ParseResult.
263 static const char* GetParseResultString(URLPattern::ParseResult parse_result);
264
265 private:
266 // Returns true if any of the |schemes| items matches our scheme.
267 bool MatchesAnyScheme(const std::vector<std::string>& schemes) const;
268
269 // Returns true if all of the |schemes| items matches our scheme.
270 bool MatchesAllSchemes(const std::vector<std::string>& schemes) const;
271
272 bool MatchesSecurityOriginHelper(const GURL& test) const;
273
274 // Returns true if our port matches the |port| pattern (it may be "*").
275 bool MatchesPortPattern(base::StringPiece port) const;
276
277 // If the URLPattern contains a wildcard scheme, returns a list of
278 // equivalent literal schemes, otherwise returns the current scheme.
279 std::vector<std::string> GetExplicitSchemes() const;
280
281 // A bitmask containing the schemes which are considered valid for this
282 // pattern. Parse() uses this to decide whether a pattern contains a valid
283 // scheme.
284 int valid_schemes_;
285
286 // True if this is a special-case "<all_urls>" pattern.
287 bool match_all_urls_;
288
289 // The scheme for the pattern.
290 std::string scheme_;
291
292 // The host without any leading "*" components.
293 std::string host_;
294
295 // Whether we should match subdomains of the host. This is true if the first
296 // component of the pattern's host was "*".
297 bool match_subdomains_;
298
299 // Whether we should match the effective TLD of the host. This is true by
300 // default and only false if ParseOptions is ALLOW_WILDCARD_FOR_EFFECTIVE_TLD
301 // and is only applicable when the the pattern's host ends with ".*"
302 // (e.g. https://example.*/*).
303 bool match_effective_tld_;
304
305 // The port.
306 std::string port_;
307
308 // The path to match. This is everything after the host of the URL, or
309 // everything after the scheme in the case of file:// URLs.
310 std::string path_;
311
312 // The path with "?" and "\" characters escaped for use with the
313 // MatchPattern() function.
314 std::string path_escaped_;
315
316 // A string representing this URLPattern.
317 mutable std::string spec_;
318};
319
320std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern);
321
322typedef std::vector<URLPattern> URLPatternList;
323
324#endif // EXTENSIONS_COMMON_URL_PATTERN_H_
325