1 | /* This file is part of the KDE project |
2 | |
3 | Copyright (C) 2005 Ivor Hewitt <ivor@kde.org> |
4 | Copyright (C) 2008 Maksim Orlovich <maksim@kde.org> |
5 | Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com> |
6 | |
7 | This library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Library General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2 of the License, or (at your option) any later version. |
11 | |
12 | This library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Library General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Library General Public License |
18 | along with this library; see the file COPYING.LIB. If not, write to |
19 | the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
20 | Boston, MA 02110-1301, USA. |
21 | */ |
22 | |
23 | #include "khtml_filter_p.h" |
24 | #include <QDebug> |
25 | |
26 | // rolling hash parameters |
27 | #define HASH_P (1997) |
28 | #define HASH_Q (17509) |
29 | // HASH_MOD = (HASH_P^7) % HASH_Q |
30 | #define HASH_MOD (523) |
31 | |
32 | namespace khtml { |
33 | |
34 | // We only want a subset of features of wildcards -- just the |
35 | // star, so we escape the rest before passing to QRegExp. |
36 | // The \ is escaped due to a QRegExp bug. |
37 | // ### we really should rather parse it ourselves, in order to |
38 | // handle adblock-special things like | and ^ properly. |
39 | static QRegExp fromAdBlockWildcard(const QString& wcStr) { |
40 | QRegExp rx; |
41 | rx.setPatternSyntax(QRegExp::Wildcard); |
42 | |
43 | QString out; |
44 | for (int p = 0; p < wcStr.length(); ++p) { |
45 | QChar c = wcStr[p]; |
46 | if (c == QLatin1Char('?')) |
47 | out += QLatin1String("[?]" ); |
48 | else if (c == QLatin1Char('[')) |
49 | out += QLatin1String("[[]" ); |
50 | else if (c == QLatin1Char('\\')) |
51 | out += QLatin1String("[\\]" ); |
52 | else |
53 | out += c; |
54 | } |
55 | |
56 | rx.setPattern(out); |
57 | return rx; |
58 | } |
59 | |
60 | void FilterSet::addFilter(const QString& filterStr) |
61 | { |
62 | QString filter = filterStr; |
63 | |
64 | /** ignore special lines starting with "[", "!", "&", or "#" or contain "#" (comments or features are not supported by KHTML's AdBlock */ |
65 | QChar firstChar = filter.at(0); |
66 | if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#'))) |
67 | return; |
68 | |
69 | // Strip leading @@ |
70 | int first = 0; |
71 | int last = filter.length() - 1; |
72 | if (filter.startsWith(QLatin1String("@@" ))) |
73 | first = 2; |
74 | |
75 | // Strip options, we ignore them for now. |
76 | int dollar = filter.lastIndexOf(QLatin1Char('$')); |
77 | if (dollar != -1) { |
78 | last = dollar - 1; |
79 | // If only "*" is left after ignoring the options, disregard the rule. |
80 | if (first == last && firstChar == QLatin1Char('*')) |
81 | return; |
82 | } |
83 | |
84 | // Perhaps nothing left? |
85 | if (first > last) |
86 | return; |
87 | |
88 | filter = filter.mid(first, last - first + 1); |
89 | |
90 | // Is it a regexp filter? |
91 | if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/'))) |
92 | { |
93 | QString inside = filter.mid(1, filter.length()-2); |
94 | QRegExp rx(inside); |
95 | reFilters.append(rx); |
96 | // qDebug() << "R:" << inside; |
97 | } |
98 | else |
99 | { |
100 | // Nope, a wildcard one. |
101 | // Note: For these, we also need to handle |. |
102 | |
103 | // Strip wildcards at the ends |
104 | first = 0; |
105 | last = filter.length() - 1; |
106 | |
107 | while (first < filter.length() && filter[first] == QLatin1Char('*')) |
108 | ++first; |
109 | |
110 | while (last >= 0 && filter[last] == QLatin1Char('*')) |
111 | --last; |
112 | |
113 | if (first > last) |
114 | filter = QLatin1String("*" ); // erm... Well, they asked for it. |
115 | else |
116 | filter = filter.mid(first, last - first + 1); |
117 | |
118 | // Now, do we still have any wildcard stuff left? |
119 | if (filter.contains("*" )) |
120 | { |
121 | // check if we can use RK first (and then check full RE for the rest) for better performance |
122 | int aPos = filter.indexOf('*'); |
123 | if (aPos < 0) |
124 | aPos = filter.length(); |
125 | if (aPos > 7) { |
126 | QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*')); |
127 | // We pad the final r.e. with * so we can check for an exact match |
128 | stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx); |
129 | } else { |
130 | QRegExp rx = fromAdBlockWildcard(filter); |
131 | reFilters.append(rx); |
132 | } |
133 | } |
134 | else |
135 | { |
136 | // Fast path |
137 | stringFiltersMatcher.addString(filter); |
138 | } |
139 | } |
140 | } |
141 | |
142 | bool FilterSet::isUrlMatched(const QString& url) |
143 | { |
144 | if (stringFiltersMatcher.isMatched(url)) |
145 | return true; |
146 | |
147 | for (int c = 0; c < reFilters.size(); ++c) |
148 | { |
149 | if (url.contains(reFilters[c])) |
150 | return true; |
151 | } |
152 | |
153 | return false; |
154 | } |
155 | |
156 | QString FilterSet::urlMatchedBy(const QString& url) |
157 | { |
158 | QString by; |
159 | |
160 | if (stringFiltersMatcher.isMatched(url, &by)) |
161 | return by; |
162 | |
163 | for (int c = 0; c < reFilters.size(); ++c) |
164 | { |
165 | if (url.contains(reFilters[c])) |
166 | { |
167 | by = reFilters[c].pattern(); |
168 | break; |
169 | } |
170 | } |
171 | |
172 | return by; |
173 | } |
174 | |
175 | void FilterSet::clear() |
176 | { |
177 | reFilters.clear(); |
178 | stringFiltersMatcher.clear(); |
179 | } |
180 | |
181 | |
182 | void StringsMatcher::addString(const QString& pattern) |
183 | { |
184 | if (pattern.length() < 8) { |
185 | // handle short string differently |
186 | shortStringFilters.append(pattern); |
187 | } else { |
188 | // use modified Rabin-Karp's algorithm with 8-length string hash |
189 | // i.e. store hash of first 8 chars in the HashMap for fast look-up |
190 | stringFilters.append(pattern); |
191 | int ind = stringFilters.size() - 1; |
192 | int current = 0; |
193 | |
194 | // compute hash using rolling hash |
195 | // hash for string: x0,x1,x2...xn-1 will be: |
196 | // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q |
197 | // where p and q some wisely-chosen integers |
198 | /*for (int k = 0; k < 8; ++k)*/ |
199 | int len = pattern.length(); |
200 | for (int k = len - 8; k < len; ++k) |
201 | current = (current * HASH_P + pattern[k].unicode()) % HASH_Q; |
202 | |
203 | // insert computed hash value into HashMap |
204 | WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); |
205 | if (it == stringFiltersHash.end()) { |
206 | QVector<int> list; |
207 | list.append(ind); |
208 | stringFiltersHash.add(current + 1, list); |
209 | fastLookUp.setBit(current); |
210 | } else { |
211 | it->second.append(ind); |
212 | } |
213 | } |
214 | } |
215 | |
216 | void StringsMatcher::addWildedString(const QString& prefix, const QRegExp& rx) |
217 | { |
218 | rePrefixes.append(prefix); |
219 | reFilters.append(rx); |
220 | int index = -rePrefixes.size(); |
221 | |
222 | int current = 0; |
223 | for (int k = 0; k < 8; ++k) |
224 | current = (current * HASH_P + prefix[k].unicode()) % HASH_Q; |
225 | |
226 | // insert computed hash value into HashMap |
227 | WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1); |
228 | if (it == stringFiltersHash.end()) { |
229 | QVector<int> list; |
230 | list.append(index); |
231 | stringFiltersHash.add(current + 1, list); |
232 | fastLookUp.setBit(current); |
233 | } else { |
234 | it->second.append(index); |
235 | } |
236 | } |
237 | |
238 | bool StringsMatcher::isMatched(const QString& str, QString *by) const |
239 | { |
240 | // check short strings first |
241 | for (int i = 0; i < shortStringFilters.size(); ++i) { |
242 | if (str.contains(shortStringFilters[i])) |
243 | { |
244 | if (by != 0) *by = shortStringFilters[i]; |
245 | return true; |
246 | } |
247 | } |
248 | |
249 | int len = str.length(); |
250 | int k; |
251 | |
252 | int current = 0; |
253 | int next = 0; |
254 | // compute hash for first 8 characters |
255 | for (k = 0; k < 8 && k < len; ++k) |
256 | current = (current * HASH_P + str[k].unicode()) % HASH_Q; |
257 | |
258 | WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end(); |
259 | // main Rabin-Karp's algorithm loop |
260 | for (k = 7; k < len; ++k, current = next) { |
261 | // roll the hash if not at the end |
262 | // (calculate hash for the next iteration) |
263 | if (k + 1 < len) |
264 | next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q; |
265 | |
266 | if (!fastLookUp.testBit(current)) |
267 | continue; |
268 | |
269 | // look-up the hash in the HashMap and check all strings |
270 | WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1); |
271 | |
272 | // check possible strings |
273 | if (it != hashEnd) { |
274 | for (int j = 0; j < it->second.size(); ++j) { |
275 | int index = it->second[j]; |
276 | // check if we got simple string or REs prefix |
277 | if (index >= 0) { |
278 | int flen = stringFilters[index].length(); |
279 | if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen)) |
280 | { |
281 | if (by != 0) *by = stringFilters[index]; |
282 | return true; |
283 | } |
284 | } else { |
285 | index = -index - 1; |
286 | int flen = rePrefixes[index].length(); |
287 | if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen)) |
288 | { |
289 | int remStart = k - 7 + flen; |
290 | QString remainder = QString::fromRawData(str.unicode() + remStart, |
291 | str.length() - remStart); |
292 | if (reFilters[index].exactMatch(remainder)) { |
293 | if (by != 0) *by = rePrefixes[index]+reFilters[index].pattern(); |
294 | return true; |
295 | } |
296 | } |
297 | } |
298 | } |
299 | } |
300 | } |
301 | |
302 | return false; |
303 | } |
304 | |
305 | void StringsMatcher::clear() |
306 | { |
307 | stringFilters.clear(); |
308 | shortStringFilters.clear(); |
309 | reFilters.clear(); |
310 | rePrefixes.clear(); |
311 | stringFiltersHash.clear(); |
312 | fastLookUp.resize(HASH_Q); |
313 | fastLookUp.fill(0, 0, HASH_Q); |
314 | } |
315 | |
316 | } |
317 | |
318 | // kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on; |
319 | |