1/* This file is part of the KDE project
2
3 Copyright (C) 2005 Ivor Hewitt <ivor@kde.org>
4 Copyright (C) 2008 Maksim Orlovich <maksim@kde.org>
5 Copyright (C) 2008 Vyacheslav Tokarev <tsjoker@gmail.com>
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
21*/
22
23#include "khtml_filter_p.h"
24#include <QDebug>
25
26// rolling hash parameters
27#define HASH_P (1997)
28#define HASH_Q (17509)
29// HASH_MOD = (HASH_P^7) % HASH_Q
30#define HASH_MOD (523)
31
32namespace khtml {
33
34// We only want a subset of features of wildcards -- just the
35// star, so we escape the rest before passing to QRegExp.
36// The \ is escaped due to a QRegExp bug.
37// ### we really should rather parse it ourselves, in order to
38// handle adblock-special things like | and ^ properly.
39static QRegExp fromAdBlockWildcard(const QString& wcStr) {
40 QRegExp rx;
41 rx.setPatternSyntax(QRegExp::Wildcard);
42
43 QString out;
44 for (int p = 0; p < wcStr.length(); ++p) {
45 QChar c = wcStr[p];
46 if (c == QLatin1Char('?'))
47 out += QLatin1String("[?]");
48 else if (c == QLatin1Char('['))
49 out += QLatin1String("[[]");
50 else if (c == QLatin1Char('\\'))
51 out += QLatin1String("[\\]");
52 else
53 out += c;
54 }
55
56 rx.setPattern(out);
57 return rx;
58}
59
60void FilterSet::addFilter(const QString& filterStr)
61{
62 QString filter = filterStr;
63
64 /** ignore special lines starting with "[", "!", "&", or "#" or contain "#" (comments or features are not supported by KHTML's AdBlock */
65 QChar firstChar = filter.at(0);
66 if (firstChar == QLatin1Char('[') || firstChar == QLatin1Char('!') || firstChar == QLatin1Char('&') || firstChar == QLatin1Char('#') || filter.contains(QLatin1Char('#')))
67 return;
68
69 // Strip leading @@
70 int first = 0;
71 int last = filter.length() - 1;
72 if (filter.startsWith(QLatin1String("@@")))
73 first = 2;
74
75 // Strip options, we ignore them for now.
76 int dollar = filter.lastIndexOf(QLatin1Char('$'));
77 if (dollar != -1) {
78 last = dollar - 1;
79 // If only "*" is left after ignoring the options, disregard the rule.
80 if (first == last && firstChar == QLatin1Char('*'))
81 return;
82 }
83
84 // Perhaps nothing left?
85 if (first > last)
86 return;
87
88 filter = filter.mid(first, last - first + 1);
89
90 // Is it a regexp filter?
91 if (filter.length()>2 && filter.startsWith(QLatin1Char('/')) && filter.endsWith(QLatin1Char('/')))
92 {
93 QString inside = filter.mid(1, filter.length()-2);
94 QRegExp rx(inside);
95 reFilters.append(rx);
96// qDebug() << "R:" << inside;
97 }
98 else
99 {
100 // Nope, a wildcard one.
101 // Note: For these, we also need to handle |.
102
103 // Strip wildcards at the ends
104 first = 0;
105 last = filter.length() - 1;
106
107 while (first < filter.length() && filter[first] == QLatin1Char('*'))
108 ++first;
109
110 while (last >= 0 && filter[last] == QLatin1Char('*'))
111 --last;
112
113 if (first > last)
114 filter = QLatin1String("*"); // erm... Well, they asked for it.
115 else
116 filter = filter.mid(first, last - first + 1);
117
118 // Now, do we still have any wildcard stuff left?
119 if (filter.contains("*"))
120 {
121 // check if we can use RK first (and then check full RE for the rest) for better performance
122 int aPos = filter.indexOf('*');
123 if (aPos < 0)
124 aPos = filter.length();
125 if (aPos > 7) {
126 QRegExp rx = fromAdBlockWildcard(filter.mid(aPos) + QLatin1Char('*'));
127 // We pad the final r.e. with * so we can check for an exact match
128 stringFiltersMatcher.addWildedString(filter.mid(0, aPos), rx);
129 } else {
130 QRegExp rx = fromAdBlockWildcard(filter);
131 reFilters.append(rx);
132 }
133 }
134 else
135 {
136 // Fast path
137 stringFiltersMatcher.addString(filter);
138 }
139 }
140}
141
142bool FilterSet::isUrlMatched(const QString& url)
143{
144 if (stringFiltersMatcher.isMatched(url))
145 return true;
146
147 for (int c = 0; c < reFilters.size(); ++c)
148 {
149 if (url.contains(reFilters[c]))
150 return true;
151 }
152
153 return false;
154}
155
156QString FilterSet::urlMatchedBy(const QString& url)
157{
158 QString by;
159
160 if (stringFiltersMatcher.isMatched(url, &by))
161 return by;
162
163 for (int c = 0; c < reFilters.size(); ++c)
164 {
165 if (url.contains(reFilters[c]))
166 {
167 by = reFilters[c].pattern();
168 break;
169 }
170 }
171
172 return by;
173}
174
175void FilterSet::clear()
176{
177 reFilters.clear();
178 stringFiltersMatcher.clear();
179}
180
181
182void StringsMatcher::addString(const QString& pattern)
183{
184 if (pattern.length() < 8) {
185 // handle short string differently
186 shortStringFilters.append(pattern);
187 } else {
188 // use modified Rabin-Karp's algorithm with 8-length string hash
189 // i.e. store hash of first 8 chars in the HashMap for fast look-up
190 stringFilters.append(pattern);
191 int ind = stringFilters.size() - 1;
192 int current = 0;
193
194 // compute hash using rolling hash
195 // hash for string: x0,x1,x2...xn-1 will be:
196 // (p^(n-1)*x0 + p^(n-2)*x1 + ... + p * xn-2 + xn-1) % q
197 // where p and q some wisely-chosen integers
198 /*for (int k = 0; k < 8; ++k)*/
199 int len = pattern.length();
200 for (int k = len - 8; k < len; ++k)
201 current = (current * HASH_P + pattern[k].unicode()) % HASH_Q;
202
203 // insert computed hash value into HashMap
204 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
205 if (it == stringFiltersHash.end()) {
206 QVector<int> list;
207 list.append(ind);
208 stringFiltersHash.add(current + 1, list);
209 fastLookUp.setBit(current);
210 } else {
211 it->second.append(ind);
212 }
213 }
214}
215
216void StringsMatcher::addWildedString(const QString& prefix, const QRegExp& rx)
217{
218 rePrefixes.append(prefix);
219 reFilters.append(rx);
220 int index = -rePrefixes.size();
221
222 int current = 0;
223 for (int k = 0; k < 8; ++k)
224 current = (current * HASH_P + prefix[k].unicode()) % HASH_Q;
225
226 // insert computed hash value into HashMap
227 WTF::HashMap<int, QVector<int> >::iterator it = stringFiltersHash.find(current + 1);
228 if (it == stringFiltersHash.end()) {
229 QVector<int> list;
230 list.append(index);
231 stringFiltersHash.add(current + 1, list);
232 fastLookUp.setBit(current);
233 } else {
234 it->second.append(index);
235 }
236}
237
238bool StringsMatcher::isMatched(const QString& str, QString *by) const
239{
240 // check short strings first
241 for (int i = 0; i < shortStringFilters.size(); ++i) {
242 if (str.contains(shortStringFilters[i]))
243 {
244 if (by != 0) *by = shortStringFilters[i];
245 return true;
246 }
247 }
248
249 int len = str.length();
250 int k;
251
252 int current = 0;
253 int next = 0;
254 // compute hash for first 8 characters
255 for (k = 0; k < 8 && k < len; ++k)
256 current = (current * HASH_P + str[k].unicode()) % HASH_Q;
257
258 WTF::HashMap<int, QVector<int> >::const_iterator hashEnd = stringFiltersHash.end();
259 // main Rabin-Karp's algorithm loop
260 for (k = 7; k < len; ++k, current = next) {
261 // roll the hash if not at the end
262 // (calculate hash for the next iteration)
263 if (k + 1 < len)
264 next = (HASH_P * ((current + HASH_Q - ((HASH_MOD * str[k - 7].unicode()) % HASH_Q)) % HASH_Q) + str[k + 1].unicode()) % HASH_Q;
265
266 if (!fastLookUp.testBit(current))
267 continue;
268
269 // look-up the hash in the HashMap and check all strings
270 WTF::HashMap<int, QVector<int> >::const_iterator it = stringFiltersHash.find(current + 1);
271
272 // check possible strings
273 if (it != hashEnd) {
274 for (int j = 0; j < it->second.size(); ++j) {
275 int index = it->second[j];
276 // check if we got simple string or REs prefix
277 if (index >= 0) {
278 int flen = stringFilters[index].length();
279 if (k - flen + 1 >= 0 && stringFilters[index] == str.midRef(k - flen + 1 , flen))
280 {
281 if (by != 0) *by = stringFilters[index];
282 return true;
283 }
284 } else {
285 index = -index - 1;
286 int flen = rePrefixes[index].length();
287 if (k - 8 + flen < len && rePrefixes[index] == str.midRef(k - 7, flen))
288 {
289 int remStart = k - 7 + flen;
290 QString remainder = QString::fromRawData(str.unicode() + remStart,
291 str.length() - remStart);
292 if (reFilters[index].exactMatch(remainder)) {
293 if (by != 0) *by = rePrefixes[index]+reFilters[index].pattern();
294 return true;
295 }
296 }
297 }
298 }
299 }
300 }
301
302 return false;
303}
304
305void StringsMatcher::clear()
306{
307 stringFilters.clear();
308 shortStringFilters.clear();
309 reFilters.clear();
310 rePrefixes.clear();
311 stringFiltersHash.clear();
312 fastLookUp.resize(HASH_Q);
313 fastLookUp.fill(0, 0, HASH_Q);
314}
315
316}
317
318// kate: indent-width 4; replace-tabs on; tab-width 4; space-indent on;
319