/*
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or
 * the GNU Lesser General Public License, as specified in the COPYING file.
 *
 * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
9#ifndef _lucene_index_IndexReader_
10#define _lucene_index_IndexReader_
11
12#if defined(_LUCENE_PRAGMA_ONCE)
13# pragma once
14#endif
15
16#include <QtCore/QString>
17
18#include "CLucene/store/Directory.h"
19#include "CLucene/store/FSDirectory.h"
20#include "CLucene/store/Lock.h"
21#include "CLucene/document/Document.h"
22#include "CLucene/index/TermVector.h"
23#include "SegmentInfos.h"
24#include "Terms.h"
25
26
27CL_NS_DEF(index)
28
29
30/** IndexReader is an abstract class, providing an interface for accessing an
31 index. Search of an index is done entirely through this abstract interface,
32 so that any subclass which implements it is searchable.
33
34 <p> Concrete subclasses of IndexReader are usually constructed with a call to
35 one of the static <code>open()</code> methods, e.g. {@link #open(String)}.
36
37 <p> For efficiency, in this API documents are often referred to via
38 <i>document numbers</i>, non-negative integers which each name a unique
39 document in the index. These document numbers are ephemeral--they may change
40 as documents are added to and deleted from an index. Clients should thus not
41 rely on a given document having the same number between sessions.
42
43 <p> An IndexReader can be opened on a directory for which an IndexWriter is
44 opened already, but it cannot be used to delete documents from the index then.
45*/
46class IndexReader : LUCENE_BASE
47{
48public:
49 //Callback for classes that need to know if IndexReader is closing.
50 typedef void (*CloseCallback)(IndexReader*, void*);
51
52 class CloseCallbackCompare:public CL_NS(util)::Compare::_base{
53 public:
54 bool operator()( CloseCallback t1, CloseCallback t2 ) const{
55 return t1 > t2;
56 }
57 static void doDelete(CloseCallback dummy){
58 }
59 };
60
61
62 enum FieldOption {
63 // all fields
64 ALL = 1,
65 // all indexed fields
66 INDEXED = 2,
67 // all fields which are not indexed
68 UNINDEXED = 4,
69 // all fields which are indexed with termvectors enables
70 INDEXED_WITH_TERMVECTOR = 8,
71 // all fields which are indexed but don't have termvectors enabled
72 INDEXED_NO_TERMVECTOR = 16,
73 // all fields where termvectors are enabled. Please note that only standard termvector fields are returned
74 TERMVECTOR = 32,
75 // all field with termvectors wiht positions enabled
76 TERMVECTOR_WITH_POSITION = 64,
77 // all fields where termvectors with offset position are set
78 TERMVECTOR_WITH_OFFSET = 128,
79 // all fields where termvectors with offset and position values set
80 TERMVECTOR_WITH_POSITION_OFFSET = 256
81 };
82
83
84private:
85 bool stale;
86 bool hasChanges;
87 bool closeDirectory;
88 bool directoryOwner;
89
90 SegmentInfos* segmentInfos;
91 CL_NS(store)::Directory* directory;
92 CL_NS(store)::LuceneLock* writeLock;
93
94 typedef CL_NS(util)::CLSet<CloseCallback, void*, CloseCallbackCompare,
95 CloseCallbackCompare> CloseCallbackMap;
96 CloseCallbackMap closeCallbacks;
97
98 /** Internal use. Implements commit */
99 virtual void doCommit() = 0;
100
101 /**
102 * Tries to acquire the WriteLock on this directory.
103 * this method is only valid if this IndexReader is directory owner.
104 *
105 * @throws IOException If WriteLock cannot be acquired.
106 */
107 void aquireWriteLock();
108protected:
109 /**
110 * Constructor used if IndexReader is not owner of its directory.
111 * This is used for IndexReaders that are used within other IndexReaders that take care or locking directories.
112 *
113 * @param directory Directory where IndexReader files reside.
114 */
115 IndexReader(CL_NS(store)::Directory* dir);
116
117 /**
118 * Constructor used if IndexReader is owner of its directory.
119 * If IndexReader is owner of its directory, it locks its directory in case of write operations.
120 *
121 * @param directory Directory where IndexReader files reside.
122 * @param segmentInfos Used for write-l
123 * @param closeDirectory
124 */
125 IndexReader(CL_NS(store)::Directory* directory, SegmentInfos* segmentInfos, bool closeDirectory);
126
127
128 /// Implements close.
129 virtual void doClose() = 0;
130
131 /** Implements setNorm in subclass.*/
132 virtual void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value) = 0;
133
134 /** Implements actual undeleteAll() in subclass. */
135 virtual void doUndeleteAll() = 0;
136
137
138 /** Implements deletion of the document numbered <code>docNum</code>.
139 * Applications should call {@link #deleteDocument(int32_t)} or {@link #deleteDocuments(Term*)}.
140 */
141 virtual void doDelete(const int32_t docNum) = 0;
142
143public:
144
145 DEFINE_MUTEX(THIS_LOCK)
146
147 ///Do not access this directly, only public so that MultiReader can access it
148 virtual void commit();
149
150
151 /** Undeletes all documents currently marked as deleted in this index.*/
152 void undeleteAll();
153
154 /**
155 * Get a list of unique field names that exist in this index and have the specified
156 * field option information.
157 * @param fldOption specifies which field option should be available for the returned fields
158 * @return Collection of Strings indicating the names of the fields.
159 * @see IndexReader.FieldOption
160 */
161 virtual void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray) = 0;
162
163 _CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames();
164 _CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames(bool indexed);
165
166 /** Returns the byte-encoded normalization factor for the named field of
167 * every document. This is used by the search code to score documents.
168 *
169 * The number of bytes returned is the size of the IndexReader->maxDoc()
170 * MEMORY: The values are cached, so don't delete the returned byte array.
171 * @see Field#setBoost(qreal)
172 */
173 virtual uint8_t* norms(const TCHAR* field) = 0;
174
175
176 /** Reads the byte-encoded normalization factor for the named field of every
177 * document. This is used by the search code to score documents.
178 *
179 * @see Field#setBoost(qreal)
180 */
181 virtual void norms(const TCHAR* field, uint8_t* bytes) = 0;
182
183 /** Expert: Resets the normalization factor for the named field of the named
184 * document.
185 *
186 * @see #norms(TCHAR*)
187 * @see Similarity#decodeNorm(uint8_t)
188 */
189 void setNorm(int32_t doc, const TCHAR* field, qreal value);
190
191 /** Expert: Resets the normalization factor for the named field of the named
192 * document. The norm represents the product of the field's {@link
193 * Field#setBoost(qreal) boost} and its {@link Similarity#lengthNorm(TCHAR*,
194 * int32_t) length normalization}. Thus, to preserve the length normalization
195 * values when resetting this, one should base the new value upon the old.
196 *
197 * @see #norms(TCHAR*)
198 * @see Similarity#decodeNorm(uint8_t)
199 */
200 void setNorm(int32_t doc, const TCHAR* field, uint8_t value);
201
202 /// Release the write lock, if needed.
203 virtual ~IndexReader();
204
205 /// Returns an IndexReader reading the index in an FSDirectory in the named path.
206 static IndexReader* open(const QString& path);
207
208 /// Returns an IndexReader reading the index in the given Directory.
209 static IndexReader* open( CL_NS(store)::Directory* directory, bool closeDirectory=false);
210
211 /**
212 * Returns the time the index in the named directory was last modified.
213 * Do not use this to check whether the reader is still up-to-date, use
214 * {@link #isCurrent()} instead.
215 */
216 static uint64_t lastModified(const QString& directory);
217
218 /**
219 * Returns the time the index in the named directory was last modified.
220 * Do not use this to check whether the reader is still up-to-date, use
221 * {@link #isCurrent()} instead.
222 */
223 static uint64_t lastModified(const CL_NS(store)::Directory* directory);
224
225
226 /**
227 * Reads version number from segments files. The version number is
228 * initialized with a timestamp and then increased by one for each change of
229 * the index.
230 *
231 * @param directory where the index resides.
232 * @return version number.
233 * @throws IOException if segments file cannot be read
234 */
235 static int64_t getCurrentVersion(CL_NS(store)::Directory* directory);
236
237 /**
238 * Reads version number from segments files. The version number is
239 * initialized with a timestamp and then increased by one for each change of
240 * the index.
241 *
242 * @param directory where the index resides.
243 * @return version number.
244 * @throws IOException if segments file cannot be read
245 */
246 static int64_t getCurrentVersion(const QString& directory);
247
248 /**
249 * Version number when this IndexReader was opened.
250 */
251 int64_t getVersion();
252
253 /**
254 * Check whether this IndexReader still works on a current version of the index.
255 * If this is not the case you will need to re-open the IndexReader to
256 * make sure you see the latest changes made to the index.
257 *
258 * @throws IOException
259 */
260 bool isCurrent();
261
262
263 /**
264 * Return an array of term frequency vectors for the specified document.
265 * The array contains a vector for each vectorized field in the document.
266 * Each vector contains terms and frequencies for all terms in a given vectorized field.
267 * If no such fields existed, the method returns null. The term vectors that are
268 * returned my either be of type TermFreqVector or of type TermPositionsVector if
269 * positions or offsets have been stored.
270 *
271 * @param docNumber document for which term frequency vectors are returned
272 * @return array of term frequency vectors. May be null if no term vectors have been
273 * stored for the specified document.
274 * @throws IOException if index cannot be accessed
275 * @see org.apache.lucene.document.Field.TermVector
276 */
277 virtual bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result) =0;
278
279 /**
280 * Return a term frequency vector for the specified document and field. The
281 * returned vector contains terms and frequencies for the terms in
282 * the specified field of this document, if the field had the storeTermVector
283 * flag set. If termvectors had been stored with positions or offsets, a
284 * TermPositionsVector is returned.
285 *
286 * @param docNumber document for which the term frequency vector is returned
287 * @param field field for which the term frequency vector is returned.
288 * @return term frequency vector May be null if field does not exist in the specified
289 * document or term vector was not stored.
290 * @throws IOException if index cannot be accessed
291 * @see org.apache.lucene.document.Field.TermVector
292 */
293 virtual TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field) = 0;
294
295 /**
296 * Returns <code>true</code> if an index exists at the specified directory.
297 * If the directory does not exist or if there is no index in it.
298 * @param directory the directory to check for an index
299 * @return <code>true</code> if an index exists; <code>false</code> otherwise
300 */
301 static bool indexExists(const QString& directory);
302
303 /**
304 * Returns <code>true</code> if an index exists at the specified directory.
305 * If the directory does not exist or if there is no index in it.
306 * @param directory the directory to check for an index
307 * @return <code>true</code> if an index exists; <code>false</code> otherwise
308 * @throws IOException if there is a problem with accessing the index
309 */
310 static bool indexExists(const CL_NS(store)::Directory* directory);
311
312 /** Returns the number of documents in this index. */
313 virtual int32_t numDocs() = 0;
314
315 /** Returns one greater than the largest possible document number.
316 * This may be used to, e.g., determine how big to allocate an array which
317 * will have an element for every document number in an index.
318 */
319 virtual int32_t maxDoc() const = 0;
320
321 /** Gets the stored fields of the <code>n</code><sup>th</sup>
322 * <code>Document</code> in this index.
323 * The fields are not cleared before retrieving the document, so the
324 * object should be new or just cleared.
325 */
326 virtual bool document(int32_t n, CL_NS(document)::Document*) =0;
327
328 _CL_DEPRECATED( document(i, document) ) CL_NS(document)::Document* document(const int32_t n);
329
330 /** Returns true if document <i>n</i> has been deleted */
331 virtual bool isDeleted(const int32_t n) = 0;
332
333 /** Returns true if any documents have been deleted */
334 virtual bool hasDeletions() const = 0;
335
336 /** Returns true if there are norms stored for this field. */
337 virtual bool hasNorms(const TCHAR* field);
338
339 /** Returns an enumeration of all the terms in the index.
340 * The enumeration is ordered by Term.compareTo(). Each term
341 * is greater than all that precede it in the enumeration.
342 * @memory Caller must clean up
343 */
344 virtual TermEnum* terms() const =0;
345
346 /** Returns an enumeration of all terms after a given term.
347 * The enumeration is ordered by Term.compareTo(). Each term
348 * is greater than all that precede it in the enumeration.
349 * @memory Caller must clean up
350 */
351 virtual TermEnum* terms(const Term* t) const = 0;
352
353 /** Returns the number of documents containing the term <code>t</code>. */
354 virtual int32_t docFreq(const Term* t) const = 0;
355
356 /* Returns an unpositioned TermPositions enumerator.
357 * @memory Caller must clean up
358 */
359 virtual TermPositions* termPositions() const = 0;
360
361 /** Returns an enumeration of all the documents which contain
362 * <code>term</code>. For each document, in addition to the document number
363 * and frequency of the term in that document, a list of all of the ordinal
364 * positions of the term in the document is available. Thus, this method
365 * implements the mapping:
366 *
367 * <p><ul>
368 * Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
369 * &lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
370 * pos<sub>freq-1</sub>&gt;
371 * &gt;<sup>*</sup>
372 * </ul>
373 * <p> This positional information faciliates phrase and proximity searching.
374 * <p>The enumeration is ordered by document number. Each document number is
375 * greater than all that precede it in the enumeration.
376 * @memory Caller must clean up
377 */
378 TermPositions* termPositions(Term* term) const;
379
380 /** Returns an unpositioned {@link TermDocs} enumerator.
381 * @memory Caller must clean up
382 */
383 virtual TermDocs* termDocs() const = 0;
384
385 /** Returns an enumeration of all the documents which contain
386 * <code>term</code>. For each document, the document number, the frequency of
387 * the term in that document is also provided, for use in search scoring.
388 * Thus, this method implements the mapping:
389 * <p><ul>Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup></ul>
390 * <p>The enumeration is ordered by document number. Each document number
391 * is greater than all that precede it in the enumeration.
392 * @memory Caller must clean up
393 */
394 TermDocs* termDocs(Term* term) const;
395
396 /** Deletes the document numbered <code>docNum</code>. Once a document is
397 * deleted it will not appear in TermDocs or TermPostitions enumerations.
398 * Attempts to read its field with the {@link #document}
399 * method will result in an error. The presence of this document may still be
400 * reflected in the {@link #docFreq} statistic, though
401 * this will be corrected eventually as the index is further modified.
402 */
403 void deleteDocument(const int32_t docNum);
404
405 ///@deprecated. Use deleteDocument instead.
406 _CL_DEPRECATED( deleteDocument ) void deleteDoc(const int32_t docNum)
407 { deleteDocument(docNum); }
408
409 /** Deletes all documents containing <code>term</code>.
410 * This is useful if one uses a document field to hold a unique ID string for
411 * the document. Then to delete such a document, one merely constructs a
412 * term with the appropriate field and the unique ID string as its text and
413 * passes it to this method.
414 * See {@link #deleteDocument(int)} for information about when this deletion will
415 * become effective.
416 * @return the number of documents deleted
417 */
418 int32_t deleteDocuments(Term* term);
419
420 ///@deprecated. Use deleteDocuments instead.
421 _CL_DEPRECATED( deleteDocuments ) int32_t deleteTerm(Term* term){ return deleteDocuments(term); }
422
423 /**
424 * Closes files associated with this index and also saves any new deletions to disk.
425 * No other methods should be called after this has been called.
426 */
427 void close();
428
429 ///Checks if the index in the named directory is currently locked.
430 static bool isLocked(CL_NS(store)::Directory* directory);
431
432 ///Checks if the index in the named directory is currently locked.
433 static bool isLocked(const QString& directory);
434
435
436 ///Forcibly unlocks the index in the named directory.
437 ///Caution: this should only be used by failure recovery code,
438 ///when it is known that no other process nor thread is in fact
439 ///currently accessing this index.
440 static void unlock(CL_NS(store)::Directory* directory);
441 static void unlock(const QString& path);
442
443 /** Returns the directory this index resides in. */
444 CL_NS(store)::Directory* getDirectory() { return directory; }
445
446 /** Returns true if the file is a lucene filename (based on extension or filename) */
447 static bool isLuceneFile(const QString& filename);
448
449 /**
450 * For classes that need to know when the IndexReader closes (such as caches, etc),
451 * should pass their callback function to this.
452 */
453 void addCloseCallback(CloseCallback callback, void* parameter);
454
455protected:
456 class LockWith : public CL_NS(store)::LuceneLockWith<IndexReader*>
457 {
458 public:
459 LockWith(CL_NS(store)::LuceneLock* lock, CL_NS(store)::Directory* dir);
460
461 //Reads the segmentinfo file and depending on the number of segments found
462 //it returns a MultiReader or a SegmentReader
463 IndexReader* doBody();
464
465 private:
466 CL_NS(store)::Directory* directory;
467 };
468 friend class IndexReader::LockWith;
469
470 class CommitLockWith : public CL_NS(store)::LuceneLockWith<void>
471 {
472 public:
473 CommitLockWith(CL_NS(store)::LuceneLock* lock, IndexReader* r);
474 void doBody();
475
476 private:
477 IndexReader* reader;
478 };
479 friend class IndexReader::CommitLockWith;
480};
481
482CL_NS_END
483#endif
484
485
486