1/*------------------------------------------------------------------------------
2* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3*
4* Distributable under the terms of either the Apache License (Version 2.0) or
5* the GNU Lesser General Public License, as specified in the COPYING file.
6------------------------------------------------------------------------------*/
7#ifndef _lucene_queryParser_QueryParser_
8#define _lucene_queryParser_QueryParser_
9
10#include "CLucene/util/Array.h"
11#include "QueryParserTokenManager.h"
12#include "CLucene/document/DateTools.h"
13#include "CLucene/util/VoidMap.h"
14#include "CLucene/util/VoidList.h"
15
// Forward declarations of CLucene types that the declarations in this header
// only use through pointers. CL_CLASS_DEF is defined elsewhere; presumably it
// expands to a namespace-qualified `class X;` declaration -- TODO confirm.
16CL_CLASS_DEF(index,Term) // NOTE(review): not referenced by any declaration visible in this header
17CL_CLASS_DEF(analysis,Analyzer) // used as CL_NS(analysis)::Analyzer*
18CL_CLASS_DEF(search,Query) // used as CL_NS(search)::Query*
19CL_CLASS_DEF(search,BooleanClause) // used as std::vector<CL_NS(search)::BooleanClause*>&
20
21CL_NS_DEF(queryParser)
22
// Forward declaration of the base class of QueryParser.
// NOTE(review): QueryParser derives from QueryParserConstants, which requires the
// complete type at the point of the class definition; presumably
// "QueryParserTokenManager.h" pulls in the full definition -- TODO confirm.
23class QueryParserConstants;
24
25/**
26 * This class is generated by JavaCC. The most important method is
27 * {@link #parse(const TCHAR*)}.
28 *
29 * The syntax for query strings is as follows:
30 * A Query is a series of clauses.
31 * A clause may be prefixed by:
32 * <ul>
33 * <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign, indicating
34 * that the clause is required or prohibited respectively; or
35 * <li> a term followed by a colon, indicating the field to be searched.
36 * This enables one to construct queries which search multiple fields.
37 * </ul>
38 *
39 * A clause may be either:
40 * <ul>
41 * <li> a term, indicating all the documents that contain this term; or
42 * <li> a nested query, enclosed in parentheses. Note that this may be used
43 * with a <code>+</code>/<code>-</code> prefix to require any of a set of
44 * terms.
45 * </ul>
46 *
47 * Thus, in BNF, the query grammar is:
48 * <pre>
49 * Query ::= ( Clause )*
50 * Clause ::= ["+", "-"] [&lt;TERM&gt; ":"] ( &lt;TERM&gt; | "(" Query ")" )
51 * </pre>
52 *
53 * <p>
54 * Examples of appropriately formatted queries can be found in the <a
55 * href="http://lucene.apache.org/java/docs/queryparsersyntax.html">query syntax
56 * documentation</a>.
57 * </p>
58 *
59 * <p>
60 * In {@link RangeQuery}s, QueryParser tries to detect date values, e.g.
61 * <tt>date:[6/1/2005 TO 6/4/2005]</tt> produces a range query that searches
62 * for "date" fields between 2005-06-01 and 2005-06-04. Note that the format
63 * of the accepted input depends on the locale (setLocale() is still commented out in this port).
64 * By default a date is converted into a search term using the deprecated
65 * {@link DateField} for compatibility reasons.
66 * To use the new {@link DateTools} to convert dates, a
67 * {@link CL_NS(document)::DateTools::Resolution} has to be set.
68 * </p>
69 * <p>
70 * The date resolution that shall be used for RangeQueries can be set
71 * using {@link #setDateResolution(DateTools.Resolution)}
72 * or {@link #setDateResolution(String, DateTools.Resolution)}. The former
73 * sets the default date resolution for all fields, whereas the latter can
74 * be used to set field specific date resolutions. Field specific date
75 * resolutions take, if set, precedence over the default date resolution.
76 * </p>
77 * <p>
78 * If you use neither {@link DateField} nor {@link DateTools} in your
79 * index, you can create your own
80 * query parser that derives from QueryParser and overrides
81 * {@link #getRangeQuery(const TCHAR*, TCHAR*, TCHAR*, bool)} to
82 * use a different method for date conversion.
83 * </p>
84 *
85 * <p>Note that QueryParser is <em>not</em> thread-safe.</p>
86 *
87 * @author Brian Goetz
88 * @author Peter Halacsy
89 * @author Tatu Saloranta
90 */
91class CLUCENE_EXPORT QueryParser : public virtual QueryParserConstants
92{
93private:
    // Conjunction codes. Presumably the values produced by Conjunction() and
    // passed as `conj` to addClause() -- TODO confirm against the implementation.
94 LUCENE_STATIC_CONSTANT(int32_t, CONJ_NONE=0);
95 LUCENE_STATIC_CONSTANT(int32_t, CONJ_AND=1);
96 LUCENE_STATIC_CONSTANT(int32_t, CONJ_OR=2);
97
    // Modifier codes. Presumably the values produced by Modifiers() and
    // passed as `mods` to addClause() -- TODO confirm against the implementation.
98 LUCENE_STATIC_CONSTANT(int32_t, MOD_NONE=0);
99 LUCENE_STATIC_CONSTANT(int32_t, MOD_NOT=10);
100 LUCENE_STATIC_CONSTANT(int32_t, MOD_REQ=11);
101
102public:
103 /** The default operator for parsing queries.
104 * Use {@link QueryParser#setDefaultOperator} to change it.
105 */
106 enum Operator {
107 OR_OPERATOR, // terms without modifiers are optional (documented default mode)
108 AND_OPERATOR // terms without modifiers are treated as a conjunction
109 };
110
111private:
112 /** The actual operator that parser uses to combine query terms */
113 Operator _operator;
114
115 bool lowercaseExpandedTerms; // see setLowercaseExpandedTerms()
116 bool useOldRangeQuery; // see setUseOldRangeQuery()
117 bool allowLeadingWildcard; // see setAllowLeadingWildcard()
118 bool enablePositionIncrements; // see setEnablePositionIncrements()
119
120 CL_NS(analysis)::Analyzer* analyzer; // analyzer handed to the constructor; see getAnalyzer()
121 TCHAR* field; // default field for query terms; NOTE(review): non-const, presumably an owned copy -- confirm in ctor/dtor
122 int32_t phraseSlop; // see setPhraseSlop()
123 float_t fuzzyMinSim; // see setFuzzyMinSim()
124 int32_t fuzzyPrefixLength; // see setFuzzyPrefixLength()
125 //TODO: Locale locale = Locale.getDefault();
126
127 // the default date resolution
128 CL_NS(document)::DateTools::Resolution dateResolution;
129 // maps field names to date resolutions
    // (Dummy/DummyInt32 deletors are configured for keys and values.)
130 typedef CL_NS(util)::CLHashMap<const TCHAR*,
131 CL_NS(document)::DateTools::Resolution,
132 CL_NS(util)::Compare::TChar,
133 CL_NS(util)::Equals::TChar,
134 CL_NS(util)::Deletor::Dummy,
135 CL_NS(util)::Deletor::DummyInt32
136 > FieldToDateResolutionType;
137 FieldToDateResolutionType* fieldToDateResolution;
138
139public:
140 /** Constructs a query parser.
141 * @param f the default field for query terms.
142 * @param a used to find terms in the query text.
143 */
144 QueryParser(const TCHAR* f, CL_NS(analysis)::Analyzer* a);
    /**
    * Destructor.
    * NOTE(review): the class declares a destructor and holds raw pointer members,
    * but no copy constructor or assignment operator is declared here. Whether
    * copying a QueryParser is safe depends on ownership in the implementation
    * file -- confirm.
    */
145 virtual ~QueryParser();
    /**
    * Presumably releases the tokens produced during parsing (see _firstToken);
    * not documented in this header -- TODO confirm against the implementation.
    */
146 void _deleteTokens();
147
148 /** For backward compatibility
    * Static convenience overload.
    * @param q the query string to be parsed.
    * @param f the default field for query terms.
    * @param a used to find terms in the query text.
    * @return the parsed query; ownership is not stated here, presumably the
    * caller's -- TODO confirm.
    */
149 static CL_NS(search)::Query* parse(const TCHAR* q, const TCHAR* f, CL_NS(analysis)::Analyzer* a);
150
151 /** Parses a query string, returning a {@link CL_NS(search)::Query}.
152 * @param _query the query string to be parsed.
153 * @throws ParseException if the parsing fails
154 */
155 CL_NS(search)::Query* parse(const TCHAR* _query);
156
157 /**
158 * @return Returns the analyzer.
159 */
160 CL_NS(analysis)::Analyzer* getAnalyzer() const;
161
162 /**
163 * @return Returns the field.
164 */
165 const TCHAR* getField() const;
166
167 /**
168 * Get the minimal similarity for fuzzy queries.
169 */
170 float_t getFuzzyMinSim() const;
171
172 /**
173 * Set the minimum similarity for fuzzy queries.
174 * Default is 0.5f.
175 */
176 void setFuzzyMinSim(const float_t _fuzzyMinSim);
177
178 /**
179 * Get the prefix length for fuzzy queries.
180 * @return Returns the fuzzyPrefixLength.
181 */
182 int32_t getFuzzyPrefixLength() const;
183
184 /**
185 * Set the prefix length for fuzzy queries. Default is 0.
186 * @param _fuzzyPrefixLength The fuzzyPrefixLength to set.
187 */
188 void setFuzzyPrefixLength(const int32_t _fuzzyPrefixLength);
189
190 /**
191 * Sets the default slop for phrases. If zero, then exact phrase matches
192 * are required. Default value is zero.
193 */
194 void setPhraseSlop(const int32_t _phraseSlop);
195
196 /**
197 * Gets the default slop for phrases.
198 */
199 int32_t getPhraseSlop() const;
200
201 /**
202 * Set to <code>true</code> to allow leading wildcard characters.
203 * <p>
204 * When set, <code>*</code> or <code>?</code> are allowed as
205 * the first character of a PrefixQuery and WildcardQuery.
206 * Note that this can produce very slow
207 * queries on big indexes.
208 * <p>
209 * Default: false.
210 */
211 void setAllowLeadingWildcard(const bool _allowLeadingWildcard);
212
213 /**
214 * @see #setAllowLeadingWildcard(bool)
215 */
216 bool getAllowLeadingWildcard() const;
217
218 /**
219 * Set to <code>true</code> to enable position increments in result query.
220 * <p>
221 * When set, result phrase and multi-phrase queries will
222 * be aware of position increments.
223 * Useful when e.g. a StopFilter increases the position increment of
224 * the token that follows an omitted token.
225 * <p>
226 * Default: false.
227 */
228 void setEnablePositionIncrements(const bool _enable);
229
230 /**
231 * @see #setEnablePositionIncrements(bool)
232 */
233 bool getEnablePositionIncrements() const;
234
235 /**
236 * Sets the boolean operator of the QueryParser.
237 * In default mode (<code>OR_OPERATOR</code>) terms without any modifiers
238 * are considered optional: for example <code>capital of Hungary</code> is equal to
239 * <code>capital OR of OR Hungary</code>.<br/>
240 * In <code>AND_OPERATOR</code> mode terms are considered to be in conjunction: the
241 * above mentioned query is parsed as <code>capital AND of AND Hungary</code>
242 */
243 void setDefaultOperator(Operator _op);
244
245 /**
246 * Gets implicit operator setting, which will be either AND_OPERATOR
247 * or OR_OPERATOR.
248 */
249 Operator getDefaultOperator() const;
250
251 /**
252 * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically
253 * lower-cased or not. Default is <code>true</code>.
254 */
255 void setLowercaseExpandedTerms(const bool _lowercaseExpandedTerms);
256
257 /**
258 * @see #setLowercaseExpandedTerms(bool)
259 */
260 bool getLowercaseExpandedTerms() const;
261
262 /**
263 * By default QueryParser uses new ConstantScoreRangeQuery in preference to RangeQuery
264 * for range queries. This implementation is generally preferable because it
265 * a) Runs faster b) Does not have the scarcity of range terms unduly influence score
266 * c) avoids any "TooManyBooleanClauses" exception.
267 * However, if your application really needs to use the old-fashioned RangeQuery and the above
268 * points are not required then set this option to <code>true</code>
269 * Default is <code>false</code>.
270 */
271 void setUseOldRangeQuery(const bool _useOldRangeQuery);
272
273 /**
274 * @see #setUseOldRangeQuery(bool)
275 */
276 bool getUseOldRangeQuery() const;
277
278 /**
    * NOTE: locale support is not ported. The `locale` member is only a TODO in
    * the member list, so the two accessors below are deliberately kept inside
    * this comment and are not part of the class.
    *
279 * Set locale used by date range parsing.
280 *
281 void setLocale(const Locale _locale) {
282 locale = _locale;
283 }
284
285
286 * Returns current locale, allowing access by subclasses.
287 *
288 Locale getLocale() const {
289 return locale;
290 }
291 */
292
293 /**
294 * Sets the default date resolution used by RangeQueries for fields for which no
295 * specific date resolutions has been set. Field specific resolutions can be set
296 * with {@link #setDateResolution(const TCHAR*, DateTools::Resolution)}.
297 *
298 * @param _dateResolution the default date resolution to set
299 */
300 void setDateResolution(const CL_NS(document)::DateTools::Resolution _dateResolution);
301
302 /**
303 * Sets the date resolution used by RangeQueries for a specific field.
304 *
305 * @param fieldName field for which the date resolution is to be set
306 * @param _dateResolution date resolution to set
307 */
308 void setDateResolution(const TCHAR* fieldName, const CL_NS(document)::DateTools::Resolution _dateResolution);
309
310 /**
311 * Returns the date resolution that is used by RangeQueries for the given field.
312 * Returns null (NO_RESOLUTION), if no default or field specific date resolution has been set
313 * for the given field.
314 *
    * @param fieldName field whose date resolution is requested
315 */
316 CL_NS(document)::DateTools::Resolution getDateResolution(const TCHAR* fieldName) const;
317
318protected:
    /**
    * Adds a clause for query <code>q</code> to <code>clauses</code>.
    * Not documented in the original header; presumably <code>conj</code> is one of
    * the CONJ_* codes and <code>mods</code> one of the MOD_* codes -- TODO confirm.
    *
    * NOTE(review): std::vector is used here and in getBooleanQuery(), but no
    * &lt;vector&gt; include is visible at the top of this header; it presumably
    * arrives through one of the CLucene includes -- confirm.
    */
319 void addClause(std::vector<CL_NS(search)::BooleanClause*>& clauses, int32_t conj, int32_t mods, CL_NS(search)::Query* q);
320
321 /**
    * Factory method for the query built from a plain (non-wildcard) term or phrase.
    *
    * @param _field Name of the field query will use.
    * @param queryText Text of the term or phrase (non-const; the implementation
    * may modify it -- TODO confirm).
    *
322 * @exception ParseException throw in overridden method to disallow
323 */
324 virtual CL_NS(search)::Query* getFieldQuery(const TCHAR* _field, TCHAR* queryText);
325
326 /**
327 * Base implementation delegates to {@link #getFieldQuery(const TCHAR*, TCHAR*)}.
328 * This method may be overridden, for example, to return
329 * a SpanNearQuery instead of a PhraseQuery.
330 *
    * @param slop phrase slop requested for the resulting query
    *
331 * @exception ParseException throw in overridden method to disallow
332 */
333 virtual CL_NS(search)::Query* getFieldQuery(const TCHAR* _field, TCHAR* queryText, const int32_t slop);
334
335 /**
    * Factory method for range queries.
    *
    * @param field Name of the field query will use.
    * @param part1 first bound of the range as written in the query
    * @param part2 second bound of the range as written in the query
    * @param inclusive whether the bounds are part of the range
    *
336 * @exception ParseException throw in overridden method to disallow
337 */
338 virtual CL_NS(search)::Query* getRangeQuery(const TCHAR* field, TCHAR* part1, TCHAR* part2, const bool inclusive);
339
340 /**
341 * Factory method for generating query, given a set of clauses.
342 * By default creates a boolean query composed of clauses passed in.
343 *
344 * Can be overridden by extending classes, to modify query being
345 * returned.
    * NOTE(review): unlike the other factory methods in this section, this one
    * is NOT declared virtual, so a same-named method in a derived class only
    * hides it and is not reached through a QueryParser pointer or reference.
346 *
347 * @param clauses Vector that contains {@link BooleanClause} instances
348 * to join.
349 * @param disableCoord true if coord scoring should be disabled.
350 *
351 * @return Resulting {@link Query} object.
352 * @exception ParseException throw in overridden method to disallow
353 */
354 CL_NS(search)::Query* getBooleanQuery(std::vector<CL_NS(search)::BooleanClause*>& clauses, bool disableCoord = false);
355
356 /**
357 * Factory method for generating a query. Called when parser
358 * parses an input term token that contains one or more wildcard
359 * characters (? and *), but is not a prefix term token (one
360 * that has just a single * character at the end)
361 *<p>
362 * Depending on settings, prefix term may be lower-cased
363 * automatically. It will not go through the default Analyzer,
364 * however, since normal Analyzers are unlikely to work properly
365 * with wildcard templates.
366 *<p>
367 * Can be overridden by extending classes, to provide custom handling for
368 * wildcard queries, which may be necessary due to missing analyzer calls.
369 *
370 * @param _field Name of the field query will use.
371 * @param termStr Term token that contains one or more wild card
372 * characters (? or *), but is not simple prefix term
373 *
374 * @return Resulting {@link Query} built for the term
375 * @exception ParseException throw in overridden method to disallow
376 */
377 virtual CL_NS(search)::Query* getWildcardQuery(const TCHAR* _field, TCHAR* termStr);
378
379 /**
380 * Factory method for generating a query (similar to
381 * {@link #getWildcardQuery}). Called when parser parses an input term
382 * token that uses prefix notation; that is, contains a single '*' wildcard
383 * character as its last character. Since this is a special case
384 * of generic wildcard term, and such a query can be optimized easily,
385 * this usually results in a different query object.
386 *<p>
387 * Depending on settings, a prefix term may be lower-cased
388 * automatically. It will not go through the default Analyzer,
389 * however, since normal Analyzers are unlikely to work properly
390 * with wildcard templates.
391 *<p>
392 * Can be overridden by extending classes, to provide custom handling for
393 * wild card queries, which may be necessary due to missing analyzer calls.
394 *
395 * @param _field Name of the field query will use.
396 * @param _termStr Term token to use for building term for the query
397 * (<b>without</b> trailing '*' character!)
398 *
399 * @return Resulting {@link Query} built for the term
400 * @exception ParseException throw in overridden method to disallow
401 */
402 virtual CL_NS(search)::Query* getPrefixQuery(const TCHAR* _field, TCHAR* _termStr);
403
404 /**
405 * Factory method for generating a query (similar to
406 * {@link #getWildcardQuery}). Called when parser parses
407 * an input term token that has the fuzzy suffix (~) appended.
408 *
409 * @param _field Name of the field query will use.
410 * @param termStr Term token to use for building term for the query
    * @param minSimilarity minimum similarity requested for the fuzzy query
411 *
412 * @return Resulting {@link Query} built for the term
413 * @exception ParseException throw in overridden method to disallow
414 */
415 virtual CL_NS(search)::Query* getFuzzyQuery(const TCHAR* _field, TCHAR* termStr, const float_t minSimilarity);
416
417private:
418 /**
419 * Returns a String where the escape char has been
420 * removed, or kept only once if there was a double escape.
421 *
422 * Supports escaped unicode characters, e. g. translates
423 * <code>\\u0041</code> to <code>A</code>.
424 *
    * @param input string that may contain escape characters
    * @param output optional destination; defaults to NULL. Presumably a
    * caller-supplied buffer used instead of allocating -- TODO confirm.
    *
425 * @memory caller is responsible to free the returned string
426 *
427 */
428 TCHAR* discardEscapeChar(TCHAR* input, TCHAR* output=NULL);
429
430 /** Returns the numeric value of the hexadecimal character */
431 static int32_t hexToInt(TCHAR c);
432
    // Nested helper type, defined in the implementation; used by jj_2_rtns below.
433 struct JJCalls;
434
435public:
436 /**
437 * Returns a String where those characters that QueryParser
438 * expects to be escaped are escaped by a preceding <code>\</code>.
439 *
    * @param s the string to escape
    *
440 * @memory caller is responsible to free the returned string
441 */
442 static TCHAR* escape(const TCHAR* s);
443
    // Grammar production methods. The class comment states that the parser is
    // generated by JavaCC; these correspond to the productions sketched below.
444 // * Query ::= ( Clause )*
445 // * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
    /** Parses an optional conjunction; presumably returns a CONJ_* code -- TODO confirm. */
446 int32_t Conjunction();
447
    /** Parses optional modifiers; presumably returns a MOD_* code -- TODO confirm. */
448 int32_t Modifiers();
449
450 // This makes sure that there is no garbage after the query string
451 CL_NS(search)::Query* TopLevelQuery(TCHAR* _field);
452
    /** Parses the Query production for the given default field. */
453 CL_NS(search)::Query* fQuery(TCHAR* _field);
454
    /** Parses the Clause production for the given default field. */
455 CL_NS(search)::Query* fClause(TCHAR* _field);
456
457public:
    /** Parses a single term for the given field (note: takes a const field, unlike fQuery/fClause). */
458 CL_NS(search)::Query* fTerm(const TCHAR* _field);
459
460private:
    // Lookahead helpers; the jj_ names follow JavaCC's generated-parser naming.
    // Their exact semantics live in the implementation file.
461 bool jj_2_1(const int32_t xla);
462 bool jj_3R_2();
463 bool jj_3_1();
464 bool jj_3R_3();
465
466public:
    // Parser state. Descriptions below are inferred from the JavaCC naming
    // convention and should be confirmed against the implementation.
467 QueryParserTokenManager* token_source; // presumably the token manager that supplies tokens
468 QueryToken *token, *jj_nt; // presumably the current token and the next token
469private:
470 QueryToken *_firstToken; // presumably head of the token chain (see _deleteTokens())
471 int32_t jj_ntk;
472 QueryToken *jj_scanpos, *jj_lastpos;
473 int32_t jj_la;
474public:
475 bool lookingAhead;
476private:
477 bool jj_semLA;
478 int32_t jj_gen;
479 int32_t jj_la1[23]; // fixed-size table with 23 entries
480 static const int32_t jj_la1_0[];
481 static const int32_t jj_la1_1[];
482 JJCalls* jj_2_rtns;
483 bool jj_rescan;
484 int32_t jj_gc;
485
486public:
    // Stream / token-manager based construction and re-initialisation.
    // NOTE(review): both single-argument constructors are not `explicit`, so a
    // CharStream* or QueryParserTokenManager* converts implicitly to QueryParser.
    // Ownership of the passed pointers is not stated in this header.
487 QueryParser(CharStream* stream);
488 void ReInit(CharStream* stream);
489 QueryParser(QueryParserTokenManager* tm);
490 void ReInit(QueryParserTokenManager* tm);
491
492private:
    // Internal helpers; _init presumably holds initialisation shared by the constructors -- TODO confirm.
493 void _init(CharStream* stream);
494 QueryToken* jj_consume_token(const int32_t kind);
495 bool jj_scan_token(const int32_t kind);
496
497public:
    /** Returns the next token; not documented further in this header. */
498 QueryToken* getNextToken();
    /** Returns the token selected by <code>index</code>; the indexing base is not stated here -- TODO confirm. */
499 QueryToken* getToken(int32_t index);
500
501private:
502 int32_t f_jj_ntk();
503
    // Error-reporting state: a vector of int32_t arrays whose elements are
    // configured with an Object deletor, plus scratch fields used while
    // collecting it.
504 CL_NS(util)::CLVector< CL_NS(util)::ValueArray<int32_t>*,
505 CL_NS(util)::Deletor::Object< CL_NS(util)::ValueArray<int32_t> >
506 >* jj_expentries;
507 CL_NS(util)::ValueArray<int32_t>* jj_expentry;
508 int32_t jj_kind;
509 int32_t jj_lasttokens[100]; // fixed-size buffer with 100 entries
510 int32_t jj_endpos;
511
512 void jj_add_error_token(const int32_t kind, int32_t pos);
513
514public:
    /**
    * Reports a parse error for the current parser state. Returns void;
    * presumably the error is raised as an exception -- TODO confirm.
    */
515 void generateParseException();
516
517 //void enable_tracing() {}
518 //void disable_tracing() {}
519
520private:
521 void jj_rescan_token();
522 void jj_save(const int32_t index, int32_t xla);
523
    /**
    * Builds the message text for a parse error.
    *
    * @param currentToken token associated with the error
    * @param expectedTokenSequences sequences of token kinds that were expected
    * @param tokenImage array of token images, presumably indexed by token kind -- TODO confirm
    * @return the message; ownership of the returned string is not stated here -- TODO confirm
    */
524 TCHAR* getParseExceptionMessage(QueryToken* currentToken,
525 CL_NS(util)::CLVector< CL_NS(util)::ValueArray<int32_t>*,
526 CL_NS(util)::Deletor::Object< CL_NS(util)::ValueArray<int32_t> > >* expectedTokenSequences,
527 const TCHAR* tokenImage[]);
528};
529CL_NS_END
530#endif
531