1 | /*------------------------------------------------------------------------------ |
2 | * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
3 | * |
4 | * Distributable under the terms of either the Apache License (Version 2.0) or |
5 | * the GNU Lesser General Public License, as specified in the COPYING file. |
6 | ------------------------------------------------------------------------------*/ |
7 | #ifndef _lucene_search_Sort_ |
8 | #define _lucene_search_Sort_ |
9 | |
10 | |
11 | //#include "CLucene/index/IndexReader.h" |
12 | //#include "SearchHeader.h" |
13 | //#include "CLucene/util/Equator.h" |
14 | CL_CLASS_DEF(index,IndexReader) |
15 | CL_CLASS_DEF(util,Comparable) |
16 | |
17 | CL_NS_DEF(search) |
18 | |
19 | //predefine |
20 | class SortField; |
21 | class Sort; |
22 | |
23 | /** |
24 | * Expert: Compares two ScoreDoc objects for sorting. |
25 | * |
26 | */ |
27 | class CLUCENE_EXPORT ScoreDocComparator { |
28 | protected: |
29 | ScoreDocComparator(){} |
30 | public: |
31 | virtual ~ScoreDocComparator(); |
32 | |
33 | /** |
34 | * Compares two ScoreDoc objects and returns a result indicating their |
35 | * sort order. |
36 | * @param i First ScoreDoc |
37 | * @param j Second ScoreDoc |
38 | * @return a negative integer if <code>i</code> should come before <code>j</code><br> |
39 | * a positive integer if <code>i</code> should come after <code>j</code><br> |
40 | * <code>0</code> if they are equal |
41 | * @see java.util.Comparator |
42 | */ |
43 | virtual int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) = 0; |
44 | |
45 | /** |
46 | * Returns the value used to sort the given document. The |
47 | * object returned must implement the java.io.Serializable |
48 | * interface. This is used by multisearchers to determine how |
49 | * to collate results from their searchers. |
50 | * @see FieldDoc |
51 | * @param i Document |
52 | * @return Serializable object |
53 | */ |
54 | virtual CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) = 0; |
55 | |
56 | |
57 | /** |
58 | * Returns the type of sort. Should return <code>SortField.SCORE</code>, |
59 | * <code>SortField.DOC</code>, <code>SortField.STRING</code>, |
60 | * <code>SortField.INTEGER</code>, <code>SortField.FLOAT</code> or |
61 | * <code>SortField.CUSTOM</code>. It is not valid to return |
62 | * <code>SortField.AUTO</code>. |
63 | * This is used by multisearchers to determine how to collate results |
64 | * from their searchers. |
65 | * @return One of the constants in SortField. |
66 | * @see SortField |
67 | */ |
68 | virtual int32_t sortType() = 0; |
69 | |
70 | /** Special comparator for sorting hits according to computed relevance (document score). */ |
71 | static ScoreDocComparator* RELEVANCE(); |
72 | |
73 | /** Special comparator for sorting hits according to index order (document number). */ |
74 | static ScoreDocComparator* INDEXORDER(); |
75 | |
76 | /** Cleanup static data */ |
77 | static CLUCENE_LOCAL void _shutdown(); |
78 | }; |
79 | |
80 | /** |
81 | * Expert: returns a comparator for sorting ScoreDocs. |
82 | * |
83 | */ |
84 | class CLUCENE_EXPORT SortComparatorSource:LUCENE_BASE { |
85 | public: |
86 | virtual ~SortComparatorSource(){ |
87 | } |
88 | |
89 | /** |
90 | * return a reference to a string describing the name of the comparator |
91 | * this is used in the explanation |
92 | */ |
93 | virtual TCHAR* getName() = 0; |
94 | |
95 | virtual size_t hashCode() = 0; |
96 | |
97 | /** |
98 | * Creates a comparator for the field in the given index. |
99 | * @param reader Index to create comparator for. |
100 | * @param fieldname Name of the field to create comparator for. |
101 | * @return Comparator of ScoreDoc objects. |
102 | * @throws IOException If an error occurs reading the index. |
103 | */ |
104 | virtual ScoreDocComparator* newComparator (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname) = 0; |
105 | }; |
106 | |
107 | |
108 | /** |
109 | * Abstract base class for sorting hits returned by a Query. |
110 | * |
111 | * <p>This class should only be used if the other SortField |
112 | * types (SCORE, DOC, STRING, INT, FLOAT) do not provide an |
113 | * adequate sorting. It maintains an internal cache of values which |
114 | * could be quite large. The cache is an array of Comparable, |
115 | * one for each document in the index. There is a distinct |
116 | * Comparable for each unique term in the field - if |
117 | * some documents have the same term in the field, the cache |
118 | * array will have entries which reference the same Comparable. |
119 | * |
120 | */ |
121 | class CLUCENE_EXPORT SortComparator: public SortComparatorSource { |
122 | public: |
123 | virtual ScoreDocComparator* newComparator (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname); |
124 | |
125 | SortComparator(); |
126 | virtual ~SortComparator(); |
127 | |
128 | /** |
129 | * Returns an object which, when sorted according to natural order, |
130 | * will order the Term values in the correct order. |
131 | * <p>For example, if the Terms contained integer values, this method |
132 | * would return <code>new Integer(termtext)</code>. Note that this |
133 | * might not always be the most efficient implementation - for this |
134 | * particular example, a better implementation might be to make a |
135 | * ScoreDocLookupComparator that uses an internal lookup table of int. |
136 | * @param termtext The textual value of the term. |
137 | * @return An object representing <code>termtext</code> that sorts |
138 | * according to the natural order of <code>termtext</code>. |
139 | * @see Comparable |
140 | * @see ScoreDocComparator |
141 | */ |
142 | virtual CL_NS(util)::Comparable* getComparable (const TCHAR* termtext) = 0; |
143 | |
144 | }; |
145 | |
146 | |
147 | /** |
148 | * Stores information about how to sort documents by terms in an individual |
149 | * field. Fields must be indexed in order to sort by them. |
150 | * |
151 | */ |
152 | class CLUCENE_EXPORT SortField:LUCENE_BASE { |
153 | private: |
154 | const TCHAR* field; |
155 | int32_t type; // defaults to determining type dynamically |
156 | //Locale* locale; // defaults to "natural order" (no Locale) |
157 | bool reverse; // defaults to natural order |
158 | SortComparatorSource* factory; |
159 | |
160 | protected: |
161 | SortField (const SortField& clone); |
162 | public: |
163 | virtual ~SortField(); |
164 | |
165 | /** Sort by document score (relevancy). Sort values are Float and higher |
166 | * values are at the front. |
167 | * PORTING: this is the same as SCORE in java, it had to be renamed because |
168 | * SCORE is a system macro on some platforms (AIX). |
169 | */ |
170 | LUCENE_STATIC_CONSTANT(int32_t, DOCSCORE=0); |
171 | |
172 | /** Sort by document number (index order). Sort values are Integer and lower |
173 | * values are at the front. */ |
174 | LUCENE_STATIC_CONSTANT(int32_t, DOC=1); |
175 | |
176 | /** Guess type of sort based on field contents. A regular expression is used |
177 | * to look at the first term indexed for the field and determine if it |
178 | * represents an integer number, a floating point number, or just arbitrary |
179 | * string characters. */ |
180 | LUCENE_STATIC_CONSTANT(int32_t, AUTO=2); |
181 | |
182 | /** Sort using term values as Strings. Sort values are String and lower |
183 | * values are at the front. */ |
184 | LUCENE_STATIC_CONSTANT(int32_t, STRING=3); |
185 | |
186 | /** Sort using term values as encoded Integers. Sort values are Integer and |
187 | * lower values are at the front. */ |
188 | LUCENE_STATIC_CONSTANT(int32_t, INT=4); |
189 | |
190 | /** Sort using term values as encoded Floats. Sort values are Float and |
191 | * lower values are at the front. */ |
192 | LUCENE_STATIC_CONSTANT(int32_t, FLOAT=5); |
193 | |
194 | /** Sort using term values as encoded Longs. Sort values are Long and |
195 | * lower values are at the front. */ |
196 | LUCENE_STATIC_CONSTANT(int32_t, LONG=6); |
197 | |
198 | /** Sort using term values as encoded Doubles. Sort values are Double and |
199 | * lower values are at the front. */ |
200 | LUCENE_STATIC_CONSTANT(int32_t, DOUBLE=7); |
201 | |
202 | |
203 | /** Sort using a custom Comparator. Sort values are any Comparable and |
204 | * sorting is done according to natural order. */ |
205 | LUCENE_STATIC_CONSTANT(int32_t, CUSTOM=9); |
206 | |
207 | // IMPLEMENTATION NOTE: the FieldCache.STRING_INDEX is in the same "namespace" |
208 | // as the above static int values. Any new values must not have the same value |
209 | // as FieldCache.STRING_INDEX. |
210 | |
211 | /** Represents sorting by document score (relevancy). */ |
212 | static SortField* FIELD_SCORE(); |
213 | |
214 | /** Represents sorting by document number (index order). */ |
215 | static SortField* FIELD_DOC(); |
216 | |
217 | /** Cleanup static data */ |
218 | static CLUCENE_LOCAL void _shutdown(); |
219 | |
220 | /** Creates a sort by terms in the given field where the type of term value |
221 | * is determined dynamically ({@link #AUTO AUTO}). |
222 | * @param field Name of field to sort by, cannot be <code>null</code>. |
223 | */ |
224 | SortField (const TCHAR* field); |
225 | //SortField (const TCHAR* field, bool reverse); |
226 | //todo: we cannot make reverse use default field of =false. |
227 | //because bool and int are the same type in c, overloading is not possible |
228 | |
229 | /** Creates a sort, possibly in reverse, by terms in the given field with the |
230 | * type of term values explicitly given. |
231 | * @param field Name of field to sort by. Can be <code>null</code> if |
232 | * <code>type</code> is SCORE or DOC. |
233 | * @param type Type of values in the terms. |
234 | * @param reverse True if natural order should be reversed (default=false). |
235 | */ |
236 | SortField (const TCHAR* field, int32_t type, bool reverse); |
237 | |
238 | /* |
239 | SortField (TCHAR* field, Locale* locale) { |
240 | SortField (TCHAR* field, Locale* locale, bool reverse);*/ |
241 | |
242 | /** Creates a sort, possibly in reverse, with a custom comparison function. |
243 | * @param field Name of field to sort by; cannot be <code>null</code>. |
244 | * @param comparator Returns a comparator for sorting hits. |
245 | * @param reverse True if natural order should be reversed (default=false). |
246 | */ |
247 | SortField (const TCHAR* field, SortComparatorSource* comparator, bool reverse=false); |
248 | |
249 | /** Returns the name of the field. Could return <code>null</code> |
250 | * if the sort is by SCORE or DOC. |
251 | * @return Name of field, possibly <code>null</code>. |
252 | */ |
253 | const TCHAR* getField() const; |
254 | |
255 | SortField* clone() const; |
256 | |
257 | /** Returns the type of contents in the field. |
258 | * @return One of the constants SCORE, DOC, AUTO, STRING, INT or FLOAT. |
259 | */ |
260 | int32_t getType() const; |
261 | |
262 | /** Returns the Locale by which term values are interpreted. |
263 | * May return <code>null</code> if no Locale was specified. |
264 | * @return Locale, or <code>null</code>. |
265 | */ |
266 | /*Locale getLocale() { |
267 | return locale; |
268 | }*/ |
269 | |
270 | /** Returns whether the sort should be reversed. |
271 | * @return True if natural order should be reversed. |
272 | */ |
273 | bool getReverse() const; |
274 | |
275 | SortComparatorSource* getFactory() const; |
276 | |
277 | TCHAR* toString() const; |
278 | }; |
279 | |
280 | |
281 | |
282 | /** |
283 | * Encapsulates sort criteria for returned hits. |
284 | * |
285 | * <p>The fields used to determine sort order must be carefully chosen. |
286 | * Documents must contain a single term in such a field, |
287 | * and the value of the term should indicate the document's relative position in |
288 | * a given sort order. The field must be indexed, but should not be tokenized, |
289 | * and does not need to be stored (unless you happen to want it back with the |
290 | * rest of your document data). In other words: |
291 | * |
292 | * <dl><dd><code>document.add (new Field ("byNumber", Integer.toString(x), false, true, false));</code> |
293 | * </dd></dl> |
294 | * |
295 | * <p><h3>Valid Types of Values</h3> |
296 | * |
297 | * <p>There are three possible kinds of term values which may be put into |
298 | * sorting fields: Integers, Floats, or Strings. Unless |
299 | * {@link SortField SortField} objects are specified, the type of value |
300 | * in the field is determined by parsing the first term in the field. |
301 | * |
302 | * <p>Integer term values should contain only digits and an optional |
303 | * preceeding negative sign. Values must be base 10 and in the range |
304 | * <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive. |
305 | * Documents which should appear first in the sort |
306 | * should have low value integers, later documents high values |
307 | * (i.e. the documents should be numbered <code>1..n</code> where |
308 | * <code>1</code> is the first and <code>n</code> the last). |
309 | * |
310 | * <p>Float term values should conform to values accepted by |
311 | * {@link Float Float.valueOf(String)} (except that <code>NaN</code> |
312 | * and <code>Infinity</code> are not supported). |
313 | * Documents which should appear first in the sort |
314 | * should have low values, later documents high values. |
315 | * |
316 | * <p>String term values can contain any valid String, but should |
317 | * not be tokenized. The values are sorted according to their |
318 | * {@link Comparable natural order}. Note that using this type |
319 | * of term value has higher memory requirements than the other |
320 | * two types. |
321 | * |
322 | * <p><h3>Object Reuse</h3> |
323 | * |
324 | * <p>One of these objects can be |
325 | * used multiple times and the sort order changed between usages. |
326 | * |
327 | * <p>This class is thread safe. |
328 | * |
329 | * <p><h3>Memory Usage</h3> |
330 | * |
331 | * <p>Sorting uses of caches of term values maintained by the |
332 | * internal HitQueue(s). The cache is static and contains an integer |
333 | * or float array of length <code>IndexReader.maxDoc()</code> for each field |
334 | * name for which a sort is performed. In other words, the size of the |
335 | * cache in bytes is: |
336 | * |
337 | * <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code> |
338 | * |
339 | * <p>For String fields, the cache is larger: in addition to the |
340 | * above array, the value of every term in the field is kept in memory. |
341 | * If there are many unique terms in the field, this could |
342 | * be quite large. |
343 | * |
344 | * <p>Note that the size of the cache is not affected by how many |
345 | * fields are in the index and <i>might</i> be used to sort - only by |
346 | * the ones actually used to sort a result set. |
347 | * |
348 | * <p>The cache is cleared each time a new <code>IndexReader</code> is |
349 | * passed in, or if the value returned by <code>maxDoc()</code> |
350 | * changes for the current IndexReader. This class is not set up to |
351 | * be able to efficiently sort hits from more than one index |
352 | * simultaneously. |
353 | * |
354 | */ |
355 | class CLUCENE_EXPORT Sort:LUCENE_BASE { |
356 | // internal representation of the sort criteria |
357 | SortField** fields; |
358 | void clear(); |
359 | public: |
360 | ~Sort(); |
361 | |
362 | /** Represents sorting by computed relevance. Using this sort criteria |
363 | * returns the same results as calling {@link Searcher#search(Query) Searcher#search()} |
364 | * without a sort criteria, only with slightly more overhead. */ |
365 | static Sort* RELEVANCE(); |
366 | |
367 | /** Represents sorting by index order. */ |
368 | static Sort* INDEXORDER(); |
369 | |
370 | /** Cleanup static data */ |
371 | static CLUCENE_LOCAL void _shutdown(); |
372 | |
373 | |
374 | /** Sorts by computed relevance. This is the same sort criteria as |
375 | * calling {@link Searcher#search(Query) Searcher#search()} without a sort criteria, only with |
376 | * slightly more overhead. */ |
377 | Sort(); |
378 | |
379 | /** Sorts possibly in reverse by the terms in <code>field</code> then by |
380 | * index order (document number). The type of value in <code>field</code> is determined |
381 | * automatically. |
382 | * @see SortField#AUTO |
383 | */ |
384 | Sort (const TCHAR* field, bool reverse=false); |
385 | |
386 | /** Sorts in succession by the terms in each field. |
387 | * The type of value in <code>field</code> is determined |
388 | * automatically. |
389 | * @see SortField#AUTO |
390 | */ |
391 | Sort (const TCHAR** fields); |
392 | |
393 | /** Sorts by the criteria in the given SortField. */ |
394 | Sort (SortField* field); |
395 | |
396 | /** Sorts in succession by the criteria in each SortField. */ |
397 | Sort (SortField** fields); |
398 | |
399 | /** Sets the sort to the terms in <code>field</code> possibly in reverse, |
400 | * then by index order (document number). */ |
401 | void setSort (const TCHAR* field, bool reverse=false); |
402 | |
403 | /** Sets the sort to the terms in each field in succession. */ |
404 | void setSort (const TCHAR** fieldnames); |
405 | |
406 | /** Sets the sort to the given criteria. */ |
407 | void setSort (SortField* field); |
408 | |
409 | /** Sets the sort to the given criteria in succession. */ |
410 | void setSort (SortField** fields); |
411 | |
412 | TCHAR* toString() const; |
413 | |
414 | /** |
415 | * Representation of the sort criteria. |
416 | * @return a pointer to the of SortField array used in this sort criteria |
417 | */ |
418 | SortField** getSort() const{ return fields; } |
419 | }; |
420 | |
421 | |
422 | |
423 | |
424 | |
425 | CL_NS_END |
426 | #endif |
427 | |