1 | /*------------------------------------------------------------------------------ |
2 | * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
3 | * |
4 | * Distributable under the terms of either the Apache License (Version 2.0) or |
5 | * the GNU Lesser General Public License, as specified in the COPYING file. |
6 | ------------------------------------------------------------------------------*/ |
7 | #ifndef _lucene_queryParser_QueryParser_ |
8 | #define _lucene_queryParser_QueryParser_ |
9 | |
10 | #include "CLucene/util/Array.h" |
11 | #include "QueryParserTokenManager.h" |
12 | #include "CLucene/document/DateTools.h" |
13 | #include "CLucene/util/VoidMap.h" |
14 | #include "CLucene/util/VoidList.h" |
15 | |
16 | CL_CLASS_DEF(index,Term) |
17 | CL_CLASS_DEF(analysis,Analyzer) |
18 | CL_CLASS_DEF(search,Query) |
19 | CL_CLASS_DEF(search,BooleanClause) |
20 | |
21 | CL_NS_DEF(queryParser) |
22 | |
23 | class QueryParserConstants; |
24 | |
25 | /** |
26 | * This class is generated by JavaCC. The most important method is |
27 | * {@link #parse(const TCHAR*)}. |
28 | * |
29 | * The syntax for query strings is as follows: |
30 | * A Query is a series of clauses. |
31 | * A clause may be prefixed by: |
32 | * <ul> |
33 | * <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign, indicating |
34 | * that the clause is required or prohibited respectively; or |
35 | * <li> a term followed by a colon, indicating the field to be searched. |
36 | * This enables one to construct queries which search multiple fields. |
37 | * </ul> |
38 | * |
39 | * A clause may be either: |
40 | * <ul> |
41 | * <li> a term, indicating all the documents that contain this term; or |
42 | * <li> a nested query, enclosed in parentheses. Note that this may be used |
43 | * with a <code>+</code>/<code>-</code> prefix to require any of a set of |
44 | * terms. |
45 | * </ul> |
46 | * |
47 | * Thus, in BNF, the query grammar is: |
48 | * <pre> |
49 | * Query ::= ( Clause )* |
50 | * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" ) |
51 | * </pre> |
52 | * |
53 | * <p> |
54 | * Examples of appropriately formatted queries can be found in the <a |
55 | * href="http://lucene.apache.org/java/docs/queryparsersyntax.html">query syntax |
56 | * documentation</a>. |
57 | * </p> |
58 | * |
59 | * <p> |
60 | * In {@link RangeQuery}s, QueryParser tries to detect date values, e.g. |
61 | * <tt>date:[6/1/2005 TO 6/4/2005]</tt> produces a range query that searches |
62 | * for "date" fields between 2005-06-01 and 2005-06-04. Note that the format |
63 | * of the accepted input depends on {@link #setLocale(Locale) the locale} (setLocale is currently commented out in this header). |
64 | * By default a date is converted into a search term using the deprecated |
65 | * {@link DateField} for compatibility reasons. |
66 | * To use the new {@link DateTools} to convert dates, a |
67 | * {@link org.apache.lucene.document.DateTools.Resolution} has to be set. |
68 | * </p> |
69 | * <p> |
70 | * The date resolution that shall be used for RangeQueries can be set |
71 | * using {@link #setDateResolution(DateTools.Resolution)} |
72 | * or {@link #setDateResolution(String, DateTools.Resolution)}. The former |
73 | * sets the default date resolution for all fields, whereas the latter can |
74 | * be used to set field specific date resolutions. Field specific date |
75 | * resolutions take, if set, precedence over the default date resolution. |
76 | * </p> |
77 | * <p> |
78 | * If you use neither {@link DateField} nor {@link DateTools} in your |
79 | * index, you can create your own |
80 | * query parser that inherits from QueryParser and overrides |
81 | * {@link #getRangeQuery(String, String, String, boolean)} to |
82 | * use a different method for date conversion. |
83 | * </p> |
84 | * |
85 | * <p>Note that QueryParser is <em>not</em> thread-safe.</p> |
86 | * |
87 | * @author Brian Goetz |
88 | * @author Peter Halacsy |
89 | * @author Tatu Saloranta |
90 | */ |
91 | class CLUCENE_EXPORT QueryParser : public virtual QueryParserConstants |
92 | { |
93 | private: /* Conjunction codes (CONJ_*); presumably what Conjunction() returns - TODO confirm in the .cpp. */ |
94 | LUCENE_STATIC_CONSTANT(int32_t, CONJ_NONE=0); |
95 | LUCENE_STATIC_CONSTANT(int32_t, CONJ_AND=1); |
96 | LUCENE_STATIC_CONSTANT(int32_t, CONJ_OR=2); |
97 | /* Modifier codes (MOD_*); presumably what Modifiers() returns - TODO confirm in the .cpp. */ |
98 | LUCENE_STATIC_CONSTANT(int32_t, MOD_NONE=0); |
99 | LUCENE_STATIC_CONSTANT(int32_t, MOD_NOT=10); |
100 | LUCENE_STATIC_CONSTANT(int32_t, MOD_REQ=11); |
101 | |
102 | public: |
103 | /** Boolean operators that can be applied implicitly between query terms. |
104 | * Select one with {@link QueryParser#setDefaultOperator}; that method documents the semantics. |
105 | */ |
106 | enum Operator { |
107 | OR_OPERATOR, |
108 | AND_OPERATOR |
109 | }; |
110 | |
111 | private: |
112 | /** The operator that the parser actually uses to combine query terms */ |
113 | Operator _operator; |
114 | /* Option flags; each presumably backs the same-named set.../get... pair declared below - confirm in the .cpp. */ |
115 | bool lowercaseExpandedTerms; |
116 | bool useOldRangeQuery; |
117 | bool allowLeadingWildcard; |
118 | bool enablePositionIncrements; |
119 | /* Remaining settings. NOTE(review): ownership of analyzer and field is not visible in this header. */ |
120 | CL_NS(analysis)::Analyzer* analyzer; |
121 | TCHAR* field; |
122 | int32_t phraseSlop; |
123 | float_t fuzzyMinSim; |
124 | int32_t fuzzyPrefixLength; |
125 | // TODO: locale support is not ported yet (Java original: Locale locale = Locale.getDefault();) |
126 | |
127 | // the default date resolution |
128 | CL_NS(document)::DateTools::Resolution dateResolution; |
129 | // maps field names to date resolutions; NOTE(review): both deletors are Dummy variants, so the map presumably frees neither keys nor values - confirm who owns the key strings |
130 | typedef CL_NS(util)::CLHashMap<const TCHAR*, |
131 | CL_NS(document)::DateTools::Resolution, |
132 | CL_NS(util)::Compare::TChar, |
133 | CL_NS(util)::Equals::TChar, |
134 | CL_NS(util)::Deletor::Dummy, |
135 | CL_NS(util)::Deletor::DummyInt32 |
136 | > FieldToDateResolutionType; |
137 | FieldToDateResolutionType* fieldToDateResolution; |
138 | |
139 | public: |
140 | /** Constructs a query parser for a default field and an analyzer. |
141 | * @param f the default field for query terms. |
142 | * @param a the analyzer used to find terms in the query text. |
143 | */ |
144 | QueryParser(const TCHAR* f, CL_NS(analysis)::Analyzer* a); |
145 | virtual ~QueryParser(); |
146 | void _deleteTokens(); /* NOTE(review): public, but the name suggests internal token cleanup - confirm before calling from client code. */ |
147 | |
148 | /** For backward compatibility: static one-call form taking the query text q, the default field f and the analyzer a. */ |
149 | static CL_NS(search)::Query* parse(const TCHAR* q, const TCHAR* f, CL_NS(analysis)::Analyzer* a); |
150 | |
151 | /** Parses a query string, returning a CL_NS(search)::Query. |
152 | * @param _query the query string to be parsed. |
153 | * @throws ParseException if the parsing fails |
154 | */ |
155 | CL_NS(search)::Query* parse(const TCHAR* _query); |
156 | |
157 | /** |
158 | * @return Returns the analyzer. |
159 | */ |
160 | CL_NS(analysis)::Analyzer* getAnalyzer() const; |
161 | |
162 | /** |
163 | * @return Returns the field. |
164 | */ |
165 | const TCHAR* getField() const; |
166 | |
167 | /** |
168 | * Get the minimal similarity for fuzzy queries. |
169 | */ |
170 | float_t getFuzzyMinSim() const; |
171 | |
172 | /** |
173 | * Set the minimum similarity for fuzzy queries. |
174 | * Default is 0.5f. |
175 | */ |
176 | void setFuzzyMinSim(const float_t _fuzzyMinSim); |
177 | |
178 | /** |
179 | * Get the prefix length for fuzzy queries. |
180 | * @return Returns the fuzzyPrefixLength. |
181 | */ |
182 | int32_t getFuzzyPrefixLength() const; |
183 | |
184 | /** |
185 | * Set the prefix length for fuzzy queries. Default is 0. |
186 | * @param _fuzzyPrefixLength The fuzzyPrefixLength to set. |
187 | */ |
188 | void setFuzzyPrefixLength(const int32_t _fuzzyPrefixLength); |
189 | |
190 | /** |
191 | * Sets the default slop for phrases. If zero, then exact phrase matches |
192 | * are required. Default value is zero. |
193 | */ |
194 | void setPhraseSlop(const int32_t _phraseSlop); |
195 | |
196 | /** |
197 | * Gets the default slop for phrases. |
198 | */ |
199 | int32_t getPhraseSlop() const; |
200 | |
201 | /** |
202 | * Set to <code>true</code> to allow leading wildcard characters. |
203 | * <p> |
204 | * When set, <code>*</code> or <code>?</code> are allowed as |
205 | * the first character of a PrefixQuery and WildcardQuery. |
206 | * Note that this can produce very slow |
207 | * queries on big indexes. |
208 | * <p> |
209 | * Default: false. |
210 | */ |
211 | void setAllowLeadingWildcard(const bool _allowLeadingWildcard); |
212 | |
213 | /** |
214 | * @see #setAllowLeadingWildcard(bool) |
215 | */ |
216 | bool getAllowLeadingWildcard() const; |
217 | |
218 | /** |
219 | * Set to <code>true</code> to enable position increments in result query. |
220 | * <p> |
221 | * When set, result phrase and multi-phrase queries will |
222 | * be aware of position increments. |
223 | * Useful when e.g. a StopFilter increases the position increment of |
224 | * the token that follows an omitted token. |
225 | * <p> |
226 | * Default: false. |
227 | */ |
228 | void setEnablePositionIncrements(const bool _enable); |
229 | |
230 | /** |
231 | * @see #setEnablePositionIncrements(bool) |
232 | */ |
233 | bool getEnablePositionIncrements() const; |
234 | |
235 | /** |
236 | * Sets the boolean operator of the QueryParser. |
237 | * In default mode (<code>OR_OPERATOR</code>) terms without any modifiers |
238 | * are considered optional: for example <code>capital of Hungary</code> is equal to |
239 | * <code>capital OR of OR Hungary</code>.<br/> |
240 | * In <code>AND_OPERATOR</code> mode terms are considered to be in conjunction: the |
241 | * above mentioned query is parsed as <code>capital AND of AND Hungary</code> |
242 | */ |
243 | void setDefaultOperator(Operator _op); |
244 | |
245 | /** |
246 | * Gets implicit operator setting, which will be either AND_OPERATOR |
247 | * or OR_OPERATOR. |
248 | */ |
249 | Operator getDefaultOperator() const; |
250 | |
251 | /** |
252 | * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically |
253 | * lower-cased or not. Default is <code>true</code>. |
254 | */ |
255 | void setLowercaseExpandedTerms(const bool _lowercaseExpandedTerms); |
256 | |
257 | /** |
258 | * @see #setLowercaseExpandedTerms(bool) |
259 | */ |
260 | bool getLowercaseExpandedTerms() const; |
261 | |
262 | /** |
263 | * By default QueryParser uses new ConstantScoreRangeQuery in preference to RangeQuery |
264 | * for range queries. This implementation is generally preferable because it |
265 | * a) Runs faster b) Does not have the scarcity of range terms unduly influence score |
266 | * c) avoids any "TooManyBooleanClauses" exception. |
267 | * However, if your application really needs to use the old-fashioned RangeQuery and the above |
268 | * points are not required then set this option to <code>true</code> |
269 | * Default is <code>false</code>. |
270 | */ |
271 | void setUseOldRangeQuery(const bool _useOldRangeQuery); |
272 | |
273 | /** |
274 | * @see #setUseOldRangeQuery(bool) |
275 | */ |
276 | bool getUseOldRangeQuery() const; |
277 | |
278 | /* |
279 | * Locale accessors - DISABLED: everything down to the closing marker is a comment (no locale member exists, see the TODO in the member list). |
280 | * Set locale used by date range parsing: |
281 | void setLocale(const Locale _locale) { |
282 | locale = _locale; |
283 | } |
284 | |
285 | |
286 | * Returns current locale, allowing access by subclasses. |
287 | * |
288 | Locale getLocale() const { |
289 | return locale; |
290 | } |
291 | */ |
292 | |
293 | /** |
294 | * Sets the default date resolution used by RangeQueries for fields for which no |
295 | * specific date resolutions has been set. Field specific resolutions can be set |
296 | * with {@link #setDateResolution(const TCHAR*, DateTools::Resolution)}. |
297 | * |
298 | * @param _dateResolution the default date resolution to set |
299 | */ |
300 | void setDateResolution(const CL_NS(document)::DateTools::Resolution _dateResolution); |
301 | |
302 | /** |
303 | * Sets the date resolution used by RangeQueries for a specific field. |
304 | * |
305 | * @param fieldName field for which the date resolution is to be set |
306 | * @param _dateResolution date resolution to set |
307 | */ |
308 | void setDateResolution(const TCHAR* fieldName, const CL_NS(document)::DateTools::Resolution _dateResolution); |
309 | |
310 | /** |
311 | * Returns the date resolution that is used by RangeQueries for the given field. |
312 | * Returns NO_RESOLUTION ("null" in the original wording), if no default or field specific date resolution has been set |
313 | * for the given field. |
314 | * |
315 | */ |
316 | CL_NS(document)::DateTools::Resolution getDateResolution(const TCHAR* fieldName) const; |
317 | |
318 | protected: /* addClause: presumably adds a clause for q to clauses according to conj (CONJ_*) and mods (MOD_*) - inferred from the signature; confirm in the .cpp. */ |
319 | void addClause(std::vector<CL_NS(search)::BooleanClause*>& clauses, int32_t conj, int32_t mods, CL_NS(search)::Query* q); |
320 | |
321 | /** Virtual factory hook that returns the query for queryText on _field; subclasses may override it. |
322 | * @exception ParseException throw in overridden method to disallow |
323 | */ |
324 | virtual CL_NS(search)::Query* getFieldQuery(const TCHAR* _field, TCHAR* queryText); |
325 | |
326 | /** |
327 | * Base implementation delegates to {@link #getFieldQuery(const TCHAR*, TCHAR*)}. |
328 | * This method may be overridden, for example, to return |
329 | * a SpanNearQuery instead of a PhraseQuery. |
330 | * |
331 | * @exception ParseException throw in overridden method to disallow |
332 | */ |
333 | virtual CL_NS(search)::Query* getFieldQuery(const TCHAR* _field, TCHAR* queryText, const int32_t slop); |
334 | |
335 | /** Virtual factory hook for range queries on field between part1 and part2; inclusive presumably selects inclusive bounds - confirm in the .cpp. |
336 | * @exception ParseException throw in overridden method to disallow |
337 | */ |
338 | virtual CL_NS(search)::Query* getRangeQuery(const TCHAR* field, TCHAR* part1, TCHAR* part2, const bool inclusive); |
339 | |
340 | /** |
341 | * Factory method for generating query, given a set of clauses. |
342 | * By default creates a boolean query composed of clauses passed in. |
343 | * |
344 | * Can be overridden by extending classes, to modify query being |
345 | * returned. |
346 | * NOTE(review): unlike the other factory methods here, this one is NOT declared virtual, so a subclass version only hides it - confirm whether virtual was intended. |
347 | * @param clauses std::vector that contains the {@link BooleanClause} pointers |
348 | * to join. |
349 | * @param disableCoord true if coord scoring should be disabled. |
350 | * |
351 | * @return Resulting {@link Query} object. |
352 | * @exception ParseException throw in overridden method to disallow |
353 | */ |
354 | CL_NS(search)::Query* getBooleanQuery(std::vector<CL_NS(search)::BooleanClause*>& clauses, bool disableCoord = false); |
355 | |
356 | /** |
357 | * Factory method for generating a query. Called when parser |
358 | * parses an input term token that contains one or more wildcard |
359 | * characters (? and *), but is not a prefix term token (one |
360 | * that has just a single * character at the end) |
361 | *<p> |
362 | * Depending on settings, prefix term may be lower-cased |
363 | * automatically. It will not go through the default Analyzer, |
364 | * however, since normal Analyzers are unlikely to work properly |
365 | * with wildcard templates. |
366 | *<p> |
367 | * Can be overridden by extending classes, to provide custom handling for |
368 | * wildcard queries, which may be necessary due to missing analyzer calls. |
369 | * |
370 | * @param _field Name of the field query will use. |
371 | * @param termStr Term token that contains one or more wild card |
372 | * characters (? or *), but is not simple prefix term |
373 | * |
374 | * @return Resulting {@link Query} built for the term |
375 | * @exception ParseException throw in overridden method to disallow |
376 | */ |
377 | virtual CL_NS(search)::Query* getWildcardQuery(const TCHAR* _field, TCHAR* termStr); |
378 | |
379 | /** |
380 | * Factory method for generating a query (similar to |
381 | * {@link #getWildcardQuery}). Called when parser parses an input term |
382 | * token that uses prefix notation; that is, contains a single '*' wildcard |
383 | * character as its last character. Since this is a special case |
384 | * of generic wildcard term, and such a query can be optimized easily, |
385 | * this usually results in a different query object. |
386 | *<p> |
387 | * Depending on settings, a prefix term may be lower-cased |
388 | * automatically. It will not go through the default Analyzer, |
389 | * however, since normal Analyzers are unlikely to work properly |
390 | * with wildcard templates. |
391 | *<p> |
392 | * Can be overridden by extending classes, to provide custom handling for |
393 | * wild card queries, which may be necessary due to missing analyzer calls. |
394 | * |
395 | * @param _field Name of the field query will use. |
396 | * @param _termStr Term token to use for building term for the query |
397 | * (<b>without</b> trailing '*' character!) |
398 | * |
399 | * @return Resulting {@link Query} built for the term |
400 | * @exception ParseException throw in overridden method to disallow |
401 | */ |
402 | virtual CL_NS(search)::Query* getPrefixQuery(const TCHAR* _field, TCHAR* _termStr); |
403 | |
404 | /** |
405 | * Factory method for generating a query (similar to |
406 | * {@link #getWildcardQuery}). Called when parser parses |
407 | * an input term token that has the fuzzy suffix (~) appended. |
408 | * |
409 | * @param _field Name of the field query will use. |
410 | * @param termStr Term token to use for building term for the query |
411 | * @param minSimilarity Minimum similarity for the fuzzy match (presumably the same scale as setFuzzyMinSim - confirm) |
412 | * @return Resulting {@link Query} built for the term |
413 | * @exception ParseException throw in overridden method to disallow |
414 | */ |
415 | virtual CL_NS(search)::Query* getFuzzyQuery(const TCHAR* _field, TCHAR* termStr, const float_t minSimilarity); |
416 | |
417 | private: |
418 | /** |
419 | * Returns a string where the escape char has been |
420 | * removed, or kept only once if there was a double escape. |
421 | * |
422 | * Supports escaped unicode characters, e. g. translates |
423 | * <code>\u0041</code> to <code>A</code>. |
424 | * |
425 | * @memory caller is responsible to free the returned string |
426 | * NOTE(review): the optional output parameter is undocumented; how it interacts with the "caller frees" rule is not visible here - confirm in the .cpp. |
427 | */ |
428 | TCHAR* discardEscapeChar(TCHAR* input, TCHAR* output=NULL); |
429 | |
430 | /** Returns the numeric value of the hexadecimal character */ |
431 | static int32_t hexToInt(TCHAR c); |
432 | /* Nested helper type, only declared here; it is the pointee type of the jj_2_rtns member below. */ |
433 | struct JJCalls; |
434 | |
435 | public: |
436 | /** |
437 | * Returns a string where those characters that QueryParser |
438 | * expects to be escaped are escaped by a preceding <code>\</code>. |
439 | * |
440 | * @memory caller is responsible to free the returned string |
441 | */ |
442 | static TCHAR* escape(const TCHAR* s); |
443 | |
444 | // Grammar handled by the parser methods below: Query ::= ( Clause )* |
445 | // Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" ) |
446 | int32_t Conjunction(); |
447 | |
448 | int32_t Modifiers(); |
449 | |
450 | // This makes sure that there is no garbage after the query string |
451 | CL_NS(search)::Query* TopLevelQuery(TCHAR* _field); |
452 | /* fQuery / fClause / fTerm: presumably the parser methods for the Query, Clause and term productions - inferred from the names; confirm in the .cpp. */ |
453 | CL_NS(search)::Query* fQuery(TCHAR* _field); |
454 | |
455 | CL_NS(search)::Query* fClause(TCHAR* _field); |
456 | |
457 | public: |
458 | CL_NS(search)::Query* fTerm(const TCHAR* _field); |
459 | |
460 | private: |
461 | bool jj_2_1(const int32_t xla); |
462 | bool jj_3R_2(); |
463 | bool jj_3_1(); |
464 | bool jj_3R_3(); |
465 | /* Token source and look-ahead state of the parser (the jj_* names follow the JavaCC naming mentioned in the class comment). */ |
466 | public: |
467 | QueryParserTokenManager* token_source; |
468 | QueryToken *token, *jj_nt; |
469 | private: |
470 | QueryToken *_firstToken; |
471 | int32_t jj_ntk; |
472 | QueryToken *jj_scanpos, *jj_lastpos; |
473 | int32_t jj_la; |
474 | public: |
475 | bool lookingAhead; |
476 | private: |
477 | bool jj_semLA; |
478 | int32_t jj_gen; |
479 | int32_t jj_la1[23]; |
480 | static const int32_t jj_la1_0[]; |
481 | static const int32_t jj_la1_1[]; |
482 | JJCalls* jj_2_rtns; |
483 | bool jj_rescan; |
484 | int32_t jj_gc; |
485 | /* Construct or re-initialise (ReInit) the parser from a character stream or from a token manager. NOTE(review): ownership of stream/tm is not visible here. */ |
486 | public: |
487 | QueryParser(CharStream* stream); |
488 | void ReInit(CharStream* stream); |
489 | QueryParser(QueryParserTokenManager* tm); |
490 | void ReInit(QueryParserTokenManager* tm); |
491 | |
492 | private: |
493 | void _init(CharStream* stream); |
494 | QueryToken* jj_consume_token(const int32_t kind); |
495 | bool jj_scan_token(const int32_t kind); |
496 | |
497 | public: |
498 | QueryToken* getNextToken(); |
499 | QueryToken* getToken(int32_t index); |
500 | |
501 | private: |
502 | int32_t f_jj_ntk(); |
503 | |
504 | CL_NS(util)::CLVector< CL_NS(util)::ValueArray<int32_t>*, |
505 | CL_NS(util)::Deletor::Object< CL_NS(util)::ValueArray<int32_t> > |
506 | >* jj_expentries; |
507 | CL_NS(util)::ValueArray<int32_t>* jj_expentry; |
508 | int32_t jj_kind; |
509 | int32_t jj_lasttokens[100]; |
510 | int32_t jj_endpos; |
511 | |
512 | void jj_add_error_token(const int32_t kind, int32_t pos); |
513 | /* NOTE(review): generateParseException returns void, so it presumably raises the parse error itself instead of returning an exception object - confirm in the .cpp. */ |
514 | public: |
515 | void generateParseException(); |
516 | |
517 | //void enable_tracing() {} |
518 | //void disable_tracing() {} |
519 | |
520 | private: |
521 | void jj_rescan_token(); |
522 | void jj_save(const int32_t index, int32_t xla); |
523 | /* getParseExceptionMessage: presumably builds the parse-error text from the current token and the expected token sequences - inferred from the names; ownership of the returned TCHAR* is not documented here. */ |
524 | TCHAR* getParseExceptionMessage(QueryToken* currentToken, |
525 | CL_NS(util)::CLVector< CL_NS(util)::ValueArray<int32_t>*, |
526 | CL_NS(util)::Deletor::Object< CL_NS(util)::ValueArray<int32_t> > >* expectedTokenSequences, |
527 | const TCHAR* tokenImage[]); |
528 | }; |
529 | CL_NS_END |
530 | #endif |
531 | |