1 | /*------------------------------------------------------------------------------ |
2 | * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
3 | * |
4 | * Distributable under the terms of either the Apache License (Version 2.0) or |
5 | * the GNU Lesser General Public License, as specified in the COPYING file. |
6 | ------------------------------------------------------------------------------*/ |
7 | #ifndef _lucene_search_Sort_ |
8 | #define _lucene_search_Sort_ |
9 | |
10 | #if defined(_LUCENE_PRAGMA_ONCE) |
11 | # pragma once |
12 | #endif |
13 | |
14 | #include "CLucene/index/IndexReader.h" |
15 | #include "SearchHeader.h" |
16 | |
17 | CL_NS_DEF(search) |
18 | |
19 | class SortField; //predefine |
20 | class Sort; |
21 | |
22 | /** |
23 | * Expert: Compares two ScoreDoc objects for sorting. |
24 | * |
25 | */ |
26 | class ScoreDocComparator:LUCENE_BASE { |
27 | protected: |
28 | ScoreDocComparator(){} |
29 | public: |
30 | virtual ~ScoreDocComparator(); |
31 | // CL_NS(util)::Comparable** cachedValues; |
32 | // ScoreDocComparator(CL_NS(util)::Comparable** cachedValues); |
33 | |
34 | /** |
35 | * Compares two ScoreDoc objects and returns a result indicating their |
36 | * sort order. |
37 | * @param i First ScoreDoc |
38 | * @param j Second ScoreDoc |
39 | * @return <code>-1</code> if <code>i</code> should come before <code>j</code><br><code>1</code> if <code>i</code> should come after <code>j</code><br><code>0</code> if they are equal |
40 | * @see java.util.Comparator |
41 | */ |
42 | virtual int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) = 0; |
43 | |
44 | /** |
45 | * Returns the value used to sort the given document. The |
46 | * object returned must implement the java.io.Serializable |
47 | * interface. This is used by multisearchers to determine how to collate results from their searchers. |
48 | * @see FieldDoc |
49 | * @param i Document |
50 | * @return Serializable object |
51 | */ |
52 | virtual CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) = 0; |
53 | |
54 | |
55 | /** |
56 | * Returns the type of sort. Should return <code>SortField.SCORE</code>, <code>SortField.DOC</code>, <code>SortField.STRING</code>, <code>SortField.INTEGER</code>, |
57 | * <code>SortField::FLOAT</code> or <code>SortField.CUSTOM</code>. It is not valid to return <code>SortField.AUTO</code>. |
58 | * This is used by multisearchers to determine how to collate results from their searchers. |
59 | * @return One of the constants in SortField. |
60 | * @see SortField |
61 | */ |
62 | virtual int32_t sortType() = 0; |
63 | |
64 | /** Special comparator for sorting hits according to computed relevance (document score). */ |
65 | static ScoreDocComparator* RELEVANCE; |
66 | |
67 | /** Special comparator for sorting hits according to index order (document number). */ |
68 | static ScoreDocComparator* INDEXORDER; |
69 | }; |
70 | |
71 | /** |
72 | * Expert: returns a comparator for sorting ScoreDocs. |
73 | * |
74 | */ |
75 | class SortComparatorSource:LUCENE_BASE { |
76 | public: |
77 | virtual ~SortComparatorSource(){ |
78 | } |
79 | |
80 | /** |
81 | * return a reference to a string describing the name of the comparator |
82 | * this is used in the explanation |
83 | */ |
84 | virtual TCHAR* getName() = 0; |
85 | |
86 | virtual size_t hashCode() = 0; |
87 | |
88 | /** |
89 | * Creates a comparator for the field in the given index. |
90 | * @param reader Index to create comparator for. |
91 | * @param fieldname Field to create comparator for. |
92 | * @return Comparator of ScoreDoc objects. |
93 | * @throws IOException If an error occurs reading the index. |
94 | */ |
95 | virtual ScoreDocComparator* newComparator (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname) = 0; |
96 | }; |
97 | |
98 | |
99 | /** |
100 | * Abstract base class for sorting hits returned by a Query. |
101 | * |
102 | * <p>This class should only be used if the other SortField |
103 | * types (SCORE, DOC, STRING, INT, FLOAT) do not provide an |
104 | * adequate sorting. It maintains an internal cache of values which |
105 | * could be quite large. The cache is an array of Comparable, |
106 | * one for each document in the index. There is a distinct |
107 | * Comparable for each unique term in the field - if |
108 | * some documents have the same term in the field, the cache |
109 | * array will have entries which reference the same Comparable. |
110 | * |
111 | */ |
112 | class SortComparator: public SortComparatorSource { |
113 | public: |
114 | virtual ScoreDocComparator* newComparator (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname); |
115 | |
116 | SortComparator(); |
117 | virtual ~SortComparator(); |
118 | |
119 | /** |
120 | * Returns an object which, when sorted according to natural order, |
121 | * will order the Term values in the correct order. |
122 | * <p>For example, if the Terms contained integer values, this method |
123 | * would return <code>new Integer(termtext)</code>. Note that this |
124 | * might not always be the most efficient implementation - for this |
125 | * particular example, a better implementation might be to make a |
126 | * ScoreDocLookupComparator that uses an internal lookup table of int. |
127 | * @param termtext The textual value of the term. |
128 | * @return An object representing <code>termtext</code> that sorts |
129 | * according to the natural order of <code>termtext</code>. |
130 | * @see Comparable |
131 | * @see ScoreDocComparator |
132 | */ |
133 | virtual CL_NS(util)::Comparable* getComparable (const TCHAR* termtext) = 0; |
134 | |
135 | }; |
136 | |
137 | |
138 | /** |
139 | * Stores information about how to sort documents by terms in an individual |
140 | * field. Fields must be indexed in order to sort by them. |
141 | * |
142 | */ |
143 | class SortField:LUCENE_BASE { |
144 | private: |
145 | const TCHAR* field; |
146 | int32_t type; // defaults to determining type dynamically |
147 | //Locale* locale; // defaults to "natural order" (no Locale) |
148 | bool reverse; // defaults to natural order |
149 | SortComparatorSource* factory; |
150 | |
151 | protected: |
152 | SortField (const SortField& clone); |
153 | public: |
154 | virtual ~SortField(); |
155 | |
156 | /** Sort by document score (relevancy). Sort values are Float and higher |
157 | * values are at the front. |
158 | * PORTING: this is the same as SCORE in java, it had to be renamed because |
159 | * SCORE is a system macro on some platforms (AIX). |
160 | */ |
161 | LUCENE_STATIC_CONSTANT(int32_t, DOCSCORE=0); |
162 | |
163 | /** Sort by document number (index order). Sort values are Integer and lower |
164 | * values are at the front. */ |
165 | LUCENE_STATIC_CONSTANT(int32_t, DOC=1); |
166 | |
167 | /** Guess type of sort based on field contents. A regular expression is used |
168 | * to look at the first term indexed for the field and determine if it |
169 | * represents an integer number, a floating point number, or just arbitrary |
170 | * string characters. */ |
171 | LUCENE_STATIC_CONSTANT(int32_t, AUTO=2); |
172 | |
173 | /** Sort using term values as Strings. Sort values are String and lower |
174 | * values are at the front. */ |
175 | LUCENE_STATIC_CONSTANT(int32_t, STRING=3); |
176 | |
177 | /** Sort using term values as encoded Integers. Sort values are Integer and |
178 | * lower values are at the front. */ |
179 | LUCENE_STATIC_CONSTANT(int32_t, INT=4); |
180 | |
181 | /** Sort using term values as encoded Floats. Sort values are Float and |
182 | * lower values are at the front. */ |
183 | LUCENE_STATIC_CONSTANT(int32_t, FLOAT=5); |
184 | |
185 | /** Sort using a custom Comparator. Sort values are any Comparable and |
186 | * sorting is done according to natural order. */ |
187 | LUCENE_STATIC_CONSTANT(int32_t, CUSTOM=9); |
188 | |
189 | // IMPLEMENTATION NOTE: the FieldCache.STRING_INDEX is in the same "namespace" |
190 | // as the above static int values. Any new values must not have the same value |
191 | // as FieldCache.STRING_INDEX. |
192 | |
193 | /** Represents sorting by document score (relevancy). */ |
194 | static SortField* FIELD_SCORE; |
195 | |
196 | /** Represents sorting by document number (index order). */ |
197 | static SortField* FIELD_DOC; |
198 | |
199 | SortField (const TCHAR* field); |
200 | //SortField (const TCHAR* field, bool reverse); |
201 | //todo: we cannot make reverse use default field of =false. |
202 | //because bool and int are the same type in c, overloading is not possible |
203 | SortField (const TCHAR* field, int32_t type, bool reverse); |
204 | |
205 | /* |
206 | SortField (TCHAR* field, Locale* locale) { |
207 | SortField (TCHAR* field, Locale* locale, bool reverse);*/ |
208 | |
209 | SortField (const TCHAR* field, SortComparatorSource* comparator, bool reverse=false); |
210 | |
211 | /** Returns the name of the field. Could return <code>null</code> |
212 | * if the sort is by SCORE or DOC. |
213 | * @return Name of field, possibly <code>null</code>. |
214 | */ |
215 | const TCHAR* getField() const { return field; } |
216 | |
217 | SortField* clone() const; |
218 | |
219 | /** Returns the type of contents in the field. |
220 | * @return One of the constants SCORE, DOC, AUTO, STRING, INT or FLOAT. |
221 | */ |
222 | int32_t getType() const { return type; } |
223 | |
224 | /** Returns the Locale by which term values are interpreted. |
225 | * May return <code>null</code> if no Locale was specified. |
226 | * @return Locale, or <code>null</code>. |
227 | */ |
228 | /*Locale getLocale() { |
229 | return locale; |
230 | }*/ |
231 | |
232 | /** Returns whether the sort should be reversed. |
233 | * @return True if natural order should be reversed. |
234 | */ |
235 | bool getReverse() const { return reverse; } |
236 | |
237 | SortComparatorSource* getFactory() const { return factory; } |
238 | |
239 | TCHAR* toString() const; |
240 | }; |
241 | |
242 | |
243 | |
244 | /** |
245 | * Encapsulates sort criteria for returned hits. |
246 | * |
247 | * <p>The fields used to determine sort order must be carefully chosen. |
248 | * Documents must contain a single term in such a field, |
249 | * and the value of the term should indicate the document's relative position in |
250 | * a given sort order. The field must be indexed, but should not be tokenized, |
251 | * and does not need to be stored (unless you happen to want it back with the |
252 | * rest of your document data). In other words: |
253 | * |
254 | * <dl><dd><code>document.add (new Field ("byNumber", Integer.toString(x), false, true, false));</code> |
255 | * </dd></dl> |
256 | * |
257 | * <p><h3>Valid Types of Values</h3> |
258 | * |
259 | * <p>There are three possible kinds of term values which may be put into |
260 | * sorting fields: Integers, Floats, or Strings. Unless |
261 | * {@link SortField SortField} objects are specified, the type of value |
262 | * in the field is determined by parsing the first term in the field. |
263 | * |
264 | * <p>Integer term values should contain only digits and an optional |
265 | * preceeding negative sign. Values must be base 10 and in the range |
266 | * <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive. |
267 | * Documents which should appear first in the sort |
268 | * should have low value integers, later documents high values |
269 | * (i.e. the documents should be numbered <code>1..n</code> where |
270 | * <code>1</code> is the first and <code>n</code> the last). |
271 | * |
272 | * <p>Float term values should conform to values accepted by |
273 | * {@link Float Float.valueOf(String)} (except that <code>NaN</code> |
274 | * and <code>Infinity</code> are not supported). |
275 | * Documents which should appear first in the sort |
276 | * should have low values, later documents high values. |
277 | * |
278 | * <p>String term values can contain any valid String, but should |
279 | * not be tokenized. The values are sorted according to their |
280 | * {@link Comparable natural order}. Note that using this type |
281 | * of term value has higher memory requirements than the other |
282 | * two types. |
283 | * |
284 | * <p><h3>Object Reuse</h3> |
285 | * |
286 | * <p>One of these objects can be |
287 | * used multiple times and the sort order changed between usages. |
288 | * |
289 | * <p>This class is thread safe. |
290 | * |
291 | * <p><h3>Memory Usage</h3> |
292 | * |
293 | * <p>Sorting uses of caches of term values maintained by the |
294 | * internal HitQueue(s). The cache is static and contains an integer |
295 | * or float array of length <code>IndexReader.maxDoc()</code> for each field |
296 | * name for which a sort is performed. In other words, the size of the |
297 | * cache in bytes is: |
298 | * |
299 | * <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code> |
300 | * |
301 | * <p>For String fields, the cache is larger: in addition to the |
302 | * above array, the value of every term in the field is kept in memory. |
303 | * If there are many unique terms in the field, this could |
304 | * be quite large. |
305 | * |
306 | * <p>Note that the size of the cache is not affected by how many |
307 | * fields are in the index and <i>might</i> be used to sort - only by |
308 | * the ones actually used to sort a result set. |
309 | * |
310 | * <p>The cache is cleared each time a new <code>IndexReader</code> is |
311 | * passed in, or if the value returned by <code>maxDoc()</code> |
312 | * changes for the current IndexReader. This class is not set up to |
313 | * be able to efficiently sort hits from more than one index |
314 | * simultaneously. |
315 | * |
316 | */ |
317 | class Sort:LUCENE_BASE { |
318 | // internal representation of the sort criteria |
319 | SortField** fields; |
320 | void clear(); |
321 | public: |
322 | ~Sort(); |
323 | |
324 | /** Represents sorting by computed relevance. Using this sort criteria |
325 | * returns the same results as calling {@link Searcher#search(Query) Searcher#search()} |
326 | * without a sort criteria, only with slightly more overhead. */ |
327 | static Sort* RELEVANCE; |
328 | |
329 | /** Represents sorting by index order. */ |
330 | static Sort* INDEXORDER; |
331 | |
332 | Sort(); |
333 | Sort (const TCHAR* field, bool reverse=false); |
334 | Sort (const TCHAR** fields); |
335 | Sort (SortField* field); |
336 | Sort (SortField** fields); |
337 | void setSort (const TCHAR* field, bool reverse=false); |
338 | void setSort (const TCHAR** fieldnames); |
339 | void setSort (SortField* field); |
340 | void setSort (SortField** fields); |
341 | |
342 | TCHAR* toString() const; |
343 | |
344 | /** |
345 | * Representation of the sort criteria. |
346 | * @return a pointer to the of SortField array used in this sort criteria |
347 | */ |
348 | SortField** getSort() const{ return fields; } |
349 | }; |
350 | |
351 | |
352 | |
353 | |
354 | |
355 | CL_NS_END |
356 | #endif |
357 | |