1 | /* |
2 | * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
3 | * |
4 | * Distributable under the terms of either the Apache License (Version 2.0) or |
5 | * the GNU Lesser General Public License, as specified in the COPYING file. |
6 | * |
7 | * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. |
8 | */ |
9 | #ifndef _lucene_index_IndexReader_ |
10 | #define _lucene_index_IndexReader_ |
11 | |
12 | #if defined(_LUCENE_PRAGMA_ONCE) |
13 | # pragma once |
14 | #endif |
15 | |
16 | #include <QtCore/QString> |
17 | |
18 | #include "CLucene/store/Directory.h" |
19 | #include "CLucene/store/FSDirectory.h" |
20 | #include "CLucene/store/Lock.h" |
21 | #include "CLucene/document/Document.h" |
22 | #include "CLucene/index/TermVector.h" |
23 | #include "SegmentInfos.h" |
24 | #include "Terms.h" |
25 | |
26 | |
27 | CL_NS_DEF(index) |
28 | |
29 | |
/** IndexReader is an abstract class, providing an interface for accessing an
 index. Search of an index is done entirely through this abstract interface,
 so that any subclass which implements it is searchable.

 <p> Concrete subclasses of IndexReader are usually constructed with a call to
 one of the static <code>open()</code> methods, e.g. {@link #open(const QString&)}.

 <p> For efficiency, in this API documents are often referred to via
 <i>document numbers</i>, non-negative integers which each name a unique
 document in the index. These document numbers are ephemeral--they may change
 as documents are added to and deleted from an index. Clients should thus not
 rely on a given document having the same number between sessions.

 <p> An IndexReader can be opened on a directory for which an IndexWriter is
 already open, but in that case it cannot be used to delete documents from the index.
*/
46 | class IndexReader : LUCENE_BASE |
47 | { |
48 | public: |
49 | //Callback for classes that need to know if IndexReader is closing. |
50 | typedef void (*CloseCallback)(IndexReader*, void*); |
51 | |
52 | class CloseCallbackCompare:public CL_NS(util)::Compare::_base{ |
53 | public: |
54 | bool operator()( CloseCallback t1, CloseCallback t2 ) const{ |
55 | return t1 > t2; |
56 | } |
57 | static void doDelete(CloseCallback dummy){ |
58 | } |
59 | }; |
60 | |
61 | |
62 | enum FieldOption { |
63 | // all fields |
64 | ALL = 1, |
65 | // all indexed fields |
66 | INDEXED = 2, |
67 | // all fields which are not indexed |
68 | UNINDEXED = 4, |
69 | // all fields which are indexed with termvectors enables |
70 | INDEXED_WITH_TERMVECTOR = 8, |
71 | // all fields which are indexed but don't have termvectors enabled |
72 | INDEXED_NO_TERMVECTOR = 16, |
73 | // all fields where termvectors are enabled. Please note that only standard termvector fields are returned |
74 | TERMVECTOR = 32, |
75 | // all field with termvectors wiht positions enabled |
76 | TERMVECTOR_WITH_POSITION = 64, |
77 | // all fields where termvectors with offset position are set |
78 | TERMVECTOR_WITH_OFFSET = 128, |
79 | // all fields where termvectors with offset and position values set |
80 | TERMVECTOR_WITH_POSITION_OFFSET = 256 |
81 | }; |
82 | |
83 | |
84 | private: |
85 | bool stale; |
86 | bool hasChanges; |
87 | bool closeDirectory; |
88 | bool directoryOwner; |
89 | |
90 | SegmentInfos* segmentInfos; |
91 | CL_NS(store)::Directory* directory; |
92 | CL_NS(store)::LuceneLock* writeLock; |
93 | |
94 | typedef CL_NS(util)::CLSet<CloseCallback, void*, CloseCallbackCompare, |
95 | CloseCallbackCompare> CloseCallbackMap; |
96 | CloseCallbackMap closeCallbacks; |
97 | |
98 | /** Internal use. Implements commit */ |
99 | virtual void doCommit() = 0; |
100 | |
101 | /** |
102 | * Tries to acquire the WriteLock on this directory. |
103 | * this method is only valid if this IndexReader is directory owner. |
104 | * |
105 | * @throws IOException If WriteLock cannot be acquired. |
106 | */ |
107 | void aquireWriteLock(); |
108 | protected: |
109 | /** |
110 | * Constructor used if IndexReader is not owner of its directory. |
111 | * This is used for IndexReaders that are used within other IndexReaders that take care or locking directories. |
112 | * |
113 | * @param directory Directory where IndexReader files reside. |
114 | */ |
115 | IndexReader(CL_NS(store)::Directory* dir); |
116 | |
117 | /** |
118 | * Constructor used if IndexReader is owner of its directory. |
119 | * If IndexReader is owner of its directory, it locks its directory in case of write operations. |
120 | * |
121 | * @param directory Directory where IndexReader files reside. |
122 | * @param segmentInfos Used for write-l |
123 | * @param closeDirectory |
124 | */ |
125 | IndexReader(CL_NS(store)::Directory* directory, SegmentInfos* segmentInfos, bool closeDirectory); |
126 | |
127 | |
128 | /// Implements close. |
129 | virtual void doClose() = 0; |
130 | |
131 | /** Implements setNorm in subclass.*/ |
132 | virtual void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value) = 0; |
133 | |
134 | /** Implements actual undeleteAll() in subclass. */ |
135 | virtual void doUndeleteAll() = 0; |
136 | |
137 | |
138 | /** Implements deletion of the document numbered <code>docNum</code>. |
139 | * Applications should call {@link #deleteDocument(int32_t)} or {@link #deleteDocuments(Term*)}. |
140 | */ |
141 | virtual void doDelete(const int32_t docNum) = 0; |
142 | |
143 | public: |
144 | |
145 | DEFINE_MUTEX(THIS_LOCK) |
146 | |
147 | ///Do not access this directly, only public so that MultiReader can access it |
148 | virtual void commit(); |
149 | |
150 | |
151 | /** Undeletes all documents currently marked as deleted in this index.*/ |
152 | void undeleteAll(); |
153 | |
154 | /** |
155 | * Get a list of unique field names that exist in this index and have the specified |
156 | * field option information. |
157 | * @param fldOption specifies which field option should be available for the returned fields |
158 | * @return Collection of Strings indicating the names of the fields. |
159 | * @see IndexReader.FieldOption |
160 | */ |
161 | virtual void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray) = 0; |
162 | |
163 | _CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames(); |
164 | _CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames(bool indexed); |
165 | |
166 | /** Returns the byte-encoded normalization factor for the named field of |
167 | * every document. This is used by the search code to score documents. |
168 | * |
169 | * The number of bytes returned is the size of the IndexReader->maxDoc() |
170 | * MEMORY: The values are cached, so don't delete the returned byte array. |
171 | * @see Field#setBoost(qreal) |
172 | */ |
173 | virtual uint8_t* norms(const TCHAR* field) = 0; |
174 | |
175 | |
176 | /** Reads the byte-encoded normalization factor for the named field of every |
177 | * document. This is used by the search code to score documents. |
178 | * |
179 | * @see Field#setBoost(qreal) |
180 | */ |
181 | virtual void norms(const TCHAR* field, uint8_t* bytes) = 0; |
182 | |
183 | /** Expert: Resets the normalization factor for the named field of the named |
184 | * document. |
185 | * |
186 | * @see #norms(TCHAR*) |
187 | * @see Similarity#decodeNorm(uint8_t) |
188 | */ |
189 | void setNorm(int32_t doc, const TCHAR* field, qreal value); |
190 | |
191 | /** Expert: Resets the normalization factor for the named field of the named |
192 | * document. The norm represents the product of the field's {@link |
193 | * Field#setBoost(qreal) boost} and its {@link Similarity#lengthNorm(TCHAR*, |
194 | * int32_t) length normalization}. Thus, to preserve the length normalization |
195 | * values when resetting this, one should base the new value upon the old. |
196 | * |
197 | * @see #norms(TCHAR*) |
198 | * @see Similarity#decodeNorm(uint8_t) |
199 | */ |
200 | void setNorm(int32_t doc, const TCHAR* field, uint8_t value); |
201 | |
202 | /// Release the write lock, if needed. |
203 | virtual ~IndexReader(); |
204 | |
205 | /// Returns an IndexReader reading the index in an FSDirectory in the named path. |
206 | static IndexReader* open(const QString& path); |
207 | |
208 | /// Returns an IndexReader reading the index in the given Directory. |
209 | static IndexReader* open( CL_NS(store)::Directory* directory, bool closeDirectory=false); |
210 | |
211 | /** |
212 | * Returns the time the index in the named directory was last modified. |
213 | * Do not use this to check whether the reader is still up-to-date, use |
214 | * {@link #isCurrent()} instead. |
215 | */ |
216 | static uint64_t lastModified(const QString& directory); |
217 | |
218 | /** |
219 | * Returns the time the index in the named directory was last modified. |
220 | * Do not use this to check whether the reader is still up-to-date, use |
221 | * {@link #isCurrent()} instead. |
222 | */ |
223 | static uint64_t lastModified(const CL_NS(store)::Directory* directory); |
224 | |
225 | |
226 | /** |
227 | * Reads version number from segments files. The version number is |
228 | * initialized with a timestamp and then increased by one for each change of |
229 | * the index. |
230 | * |
231 | * @param directory where the index resides. |
232 | * @return version number. |
233 | * @throws IOException if segments file cannot be read |
234 | */ |
235 | static int64_t getCurrentVersion(CL_NS(store)::Directory* directory); |
236 | |
237 | /** |
238 | * Reads version number from segments files. The version number is |
239 | * initialized with a timestamp and then increased by one for each change of |
240 | * the index. |
241 | * |
242 | * @param directory where the index resides. |
243 | * @return version number. |
244 | * @throws IOException if segments file cannot be read |
245 | */ |
246 | static int64_t getCurrentVersion(const QString& directory); |
247 | |
248 | /** |
249 | * Version number when this IndexReader was opened. |
250 | */ |
251 | int64_t getVersion(); |
252 | |
253 | /** |
254 | * Check whether this IndexReader still works on a current version of the index. |
255 | * If this is not the case you will need to re-open the IndexReader to |
256 | * make sure you see the latest changes made to the index. |
257 | * |
258 | * @throws IOException |
259 | */ |
260 | bool isCurrent(); |
261 | |
262 | |
263 | /** |
264 | * Return an array of term frequency vectors for the specified document. |
265 | * The array contains a vector for each vectorized field in the document. |
266 | * Each vector contains terms and frequencies for all terms in a given vectorized field. |
267 | * If no such fields existed, the method returns null. The term vectors that are |
268 | * returned my either be of type TermFreqVector or of type TermPositionsVector if |
269 | * positions or offsets have been stored. |
270 | * |
271 | * @param docNumber document for which term frequency vectors are returned |
272 | * @return array of term frequency vectors. May be null if no term vectors have been |
273 | * stored for the specified document. |
274 | * @throws IOException if index cannot be accessed |
275 | * @see org.apache.lucene.document.Field.TermVector |
276 | */ |
277 | virtual bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result) =0; |
278 | |
279 | /** |
280 | * Return a term frequency vector for the specified document and field. The |
281 | * returned vector contains terms and frequencies for the terms in |
282 | * the specified field of this document, if the field had the storeTermVector |
283 | * flag set. If termvectors had been stored with positions or offsets, a |
284 | * TermPositionsVector is returned. |
285 | * |
286 | * @param docNumber document for which the term frequency vector is returned |
287 | * @param field field for which the term frequency vector is returned. |
288 | * @return term frequency vector May be null if field does not exist in the specified |
289 | * document or term vector was not stored. |
290 | * @throws IOException if index cannot be accessed |
291 | * @see org.apache.lucene.document.Field.TermVector |
292 | */ |
293 | virtual TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field) = 0; |
294 | |
295 | /** |
296 | * Returns <code>true</code> if an index exists at the specified directory. |
297 | * If the directory does not exist or if there is no index in it. |
298 | * @param directory the directory to check for an index |
299 | * @return <code>true</code> if an index exists; <code>false</code> otherwise |
300 | */ |
301 | static bool indexExists(const QString& directory); |
302 | |
303 | /** |
304 | * Returns <code>true</code> if an index exists at the specified directory. |
305 | * If the directory does not exist or if there is no index in it. |
306 | * @param directory the directory to check for an index |
307 | * @return <code>true</code> if an index exists; <code>false</code> otherwise |
308 | * @throws IOException if there is a problem with accessing the index |
309 | */ |
310 | static bool indexExists(const CL_NS(store)::Directory* directory); |
311 | |
312 | /** Returns the number of documents in this index. */ |
313 | virtual int32_t numDocs() = 0; |
314 | |
315 | /** Returns one greater than the largest possible document number. |
316 | * This may be used to, e.g., determine how big to allocate an array which |
317 | * will have an element for every document number in an index. |
318 | */ |
319 | virtual int32_t maxDoc() const = 0; |
320 | |
321 | /** Gets the stored fields of the <code>n</code><sup>th</sup> |
322 | * <code>Document</code> in this index. |
323 | * The fields are not cleared before retrieving the document, so the |
324 | * object should be new or just cleared. |
325 | */ |
326 | virtual bool document(int32_t n, CL_NS(document)::Document*) =0; |
327 | |
328 | _CL_DEPRECATED( document(i, document) ) CL_NS(document)::Document* document(const int32_t n); |
329 | |
330 | /** Returns true if document <i>n</i> has been deleted */ |
331 | virtual bool isDeleted(const int32_t n) = 0; |
332 | |
333 | /** Returns true if any documents have been deleted */ |
334 | virtual bool hasDeletions() const = 0; |
335 | |
336 | /** Returns true if there are norms stored for this field. */ |
337 | virtual bool hasNorms(const TCHAR* field); |
338 | |
339 | /** Returns an enumeration of all the terms in the index. |
340 | * The enumeration is ordered by Term.compareTo(). Each term |
341 | * is greater than all that precede it in the enumeration. |
342 | * @memory Caller must clean up |
343 | */ |
344 | virtual TermEnum* terms() const =0; |
345 | |
346 | /** Returns an enumeration of all terms after a given term. |
347 | * The enumeration is ordered by Term.compareTo(). Each term |
348 | * is greater than all that precede it in the enumeration. |
349 | * @memory Caller must clean up |
350 | */ |
351 | virtual TermEnum* terms(const Term* t) const = 0; |
352 | |
353 | /** Returns the number of documents containing the term <code>t</code>. */ |
354 | virtual int32_t docFreq(const Term* t) const = 0; |
355 | |
356 | /* Returns an unpositioned TermPositions enumerator. |
357 | * @memory Caller must clean up |
358 | */ |
359 | virtual TermPositions* termPositions() const = 0; |
360 | |
361 | /** Returns an enumeration of all the documents which contain |
362 | * <code>term</code>. For each document, in addition to the document number |
363 | * and frequency of the term in that document, a list of all of the ordinal |
364 | * positions of the term in the document is available. Thus, this method |
365 | * implements the mapping: |
366 | * |
367 | * <p><ul> |
368 | * Term => <docNum, freq, |
369 | * <pos<sub>1</sub>, pos<sub>2</sub>, ... |
370 | * pos<sub>freq-1</sub>> |
371 | * ><sup>*</sup> |
372 | * </ul> |
373 | * <p> This positional information faciliates phrase and proximity searching. |
374 | * <p>The enumeration is ordered by document number. Each document number is |
375 | * greater than all that precede it in the enumeration. |
376 | * @memory Caller must clean up |
377 | */ |
378 | TermPositions* termPositions(Term* term) const; |
379 | |
380 | /** Returns an unpositioned {@link TermDocs} enumerator. |
381 | * @memory Caller must clean up |
382 | */ |
383 | virtual TermDocs* termDocs() const = 0; |
384 | |
385 | /** Returns an enumeration of all the documents which contain |
386 | * <code>term</code>. For each document, the document number, the frequency of |
387 | * the term in that document is also provided, for use in search scoring. |
388 | * Thus, this method implements the mapping: |
389 | * <p><ul>Term => <docNum, freq><sup>*</sup></ul> |
390 | * <p>The enumeration is ordered by document number. Each document number |
391 | * is greater than all that precede it in the enumeration. |
392 | * @memory Caller must clean up |
393 | */ |
394 | TermDocs* termDocs(Term* term) const; |
395 | |
396 | /** Deletes the document numbered <code>docNum</code>. Once a document is |
397 | * deleted it will not appear in TermDocs or TermPostitions enumerations. |
398 | * Attempts to read its field with the {@link #document} |
399 | * method will result in an error. The presence of this document may still be |
400 | * reflected in the {@link #docFreq} statistic, though |
401 | * this will be corrected eventually as the index is further modified. |
402 | */ |
403 | void deleteDocument(const int32_t docNum); |
404 | |
405 | ///@deprecated. Use deleteDocument instead. |
406 | _CL_DEPRECATED( deleteDocument ) void deleteDoc(const int32_t docNum) |
407 | { deleteDocument(docNum); } |
408 | |
409 | /** Deletes all documents containing <code>term</code>. |
410 | * This is useful if one uses a document field to hold a unique ID string for |
411 | * the document. Then to delete such a document, one merely constructs a |
412 | * term with the appropriate field and the unique ID string as its text and |
413 | * passes it to this method. |
414 | * See {@link #deleteDocument(int)} for information about when this deletion will |
415 | * become effective. |
416 | * @return the number of documents deleted |
417 | */ |
418 | int32_t deleteDocuments(Term* term); |
419 | |
420 | ///@deprecated. Use deleteDocuments instead. |
421 | _CL_DEPRECATED( deleteDocuments ) int32_t deleteTerm(Term* term){ return deleteDocuments(term); } |
422 | |
423 | /** |
424 | * Closes files associated with this index and also saves any new deletions to disk. |
425 | * No other methods should be called after this has been called. |
426 | */ |
427 | void close(); |
428 | |
429 | ///Checks if the index in the named directory is currently locked. |
430 | static bool isLocked(CL_NS(store)::Directory* directory); |
431 | |
432 | ///Checks if the index in the named directory is currently locked. |
433 | static bool isLocked(const QString& directory); |
434 | |
435 | |
436 | ///Forcibly unlocks the index in the named directory. |
437 | ///Caution: this should only be used by failure recovery code, |
438 | ///when it is known that no other process nor thread is in fact |
439 | ///currently accessing this index. |
440 | static void unlock(CL_NS(store)::Directory* directory); |
441 | static void unlock(const QString& path); |
442 | |
443 | /** Returns the directory this index resides in. */ |
444 | CL_NS(store)::Directory* getDirectory() { return directory; } |
445 | |
446 | /** Returns true if the file is a lucene filename (based on extension or filename) */ |
447 | static bool isLuceneFile(const QString& filename); |
448 | |
449 | /** |
450 | * For classes that need to know when the IndexReader closes (such as caches, etc), |
451 | * should pass their callback function to this. |
452 | */ |
453 | void addCloseCallback(CloseCallback callback, void* parameter); |
454 | |
455 | protected: |
456 | class LockWith : public CL_NS(store)::LuceneLockWith<IndexReader*> |
457 | { |
458 | public: |
459 | LockWith(CL_NS(store)::LuceneLock* lock, CL_NS(store)::Directory* dir); |
460 | |
461 | //Reads the segmentinfo file and depending on the number of segments found |
462 | //it returns a MultiReader or a SegmentReader |
463 | IndexReader* doBody(); |
464 | |
465 | private: |
466 | CL_NS(store)::Directory* directory; |
467 | }; |
468 | friend class IndexReader::LockWith; |
469 | |
470 | class CommitLockWith : public CL_NS(store)::LuceneLockWith<void> |
471 | { |
472 | public: |
473 | CommitLockWith(CL_NS(store)::LuceneLock* lock, IndexReader* r); |
474 | void doBody(); |
475 | |
476 | private: |
477 | IndexReader* reader; |
478 | }; |
479 | friend class IndexReader::CommitLockWith; |
480 | }; |
481 | |
482 | CL_NS_END |
483 | #endif |
484 | |
485 | |
486 | |