1 | /** @file termgenerator.h |
2 | * @brief parse free text and generate terms |
3 | */ |
4 | /* Copyright (C) 2007,2009,2011,2012 Olly Betts |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by |
8 | * the Free Software Foundation; either version 2 of the License, or |
9 | * (at your option) any later version. |
10 | * |
11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU General Public License |
17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
19 | */ |
20 | |
21 | #ifndef XAPIAN_INCLUDED_TERMGENERATOR_H |
22 | #define XAPIAN_INCLUDED_TERMGENERATOR_H |
23 | |
24 | #include <xapian/base.h> |
25 | #include <xapian/types.h> |
26 | #include <xapian/unicode.h> |
27 | #include <xapian/visibility.h> |
28 | |
29 | #include <string> |
30 | |
31 | namespace Xapian { |
32 | |
// Forward declarations: TermGenerator only refers to these classes through
// references and pointers in its interface, so their full definitions are
// not needed in this header.
class Document;		// Used by set_document() and get_document().
class Stem;		// Used by set_stemmer().
class Stopper;		// Used by set_stopper().
class WritableDatabase;	// Used by set_database().
37 | |
38 | /** Parses a piece of text and generates terms. |
39 | * |
40 | * This module takes a piece of text and parses it to produce words which are |
41 | * then used to generate suitable terms for indexing. The terms generated are |
42 | * suitable for use with Query objects produced by the QueryParser class. |
43 | */ |
44 | class XAPIAN_VISIBILITY_DEFAULT TermGenerator { |
45 | public: |
46 | /// @private @internal Class representing the TermGenerator internals. |
47 | class Internal; |
48 | /// @private @internal Reference counted internals. |
49 | Xapian::Internal::RefCntPtr<Internal> internal; |
50 | |
51 | /// Copy constructor. |
52 | TermGenerator(const TermGenerator & o); |
53 | |
54 | /// Assignment. |
55 | TermGenerator & operator=(const TermGenerator & o); |
56 | |
57 | /// Default constructor. |
58 | TermGenerator(); |
59 | |
60 | /// Destructor. |
61 | ~TermGenerator(); |
62 | |
63 | /// Set the Xapian::Stem object to be used for generating stemmed terms. |
64 | void set_stemmer(const Xapian::Stem & stemmer); |
65 | |
66 | /** Set the Xapian::Stopper object to be used for identifying stopwords. |
67 | * |
68 | * Stemmed forms of stopwords aren't indexed, but unstemmed forms still |
69 | * are so that searches for phrases including stop words still work. |
70 | * |
71 | * @param stop The Stopper object to set (default NULL, which means no |
72 | * stopwords). |
73 | */ |
74 | void set_stopper(const Xapian::Stopper *stop = NULL); |
75 | |
76 | /// Set the current document. |
77 | void set_document(const Xapian::Document & doc); |
78 | |
79 | /// Get the current document. |
80 | const Xapian::Document & get_document() const; |
81 | |
82 | /// Set the database to index spelling data to. |
83 | void set_database(const Xapian::WritableDatabase &db); |
84 | |
85 | /// Flags to OR together and pass to TermGenerator::set_flags(). |
86 | enum flags { |
87 | /// Index data required for spelling correction. |
88 | FLAG_SPELLING = 128 // Value matches QueryParser flag. |
89 | }; |
90 | |
91 | /// Stemming strategies, for use with set_stemming_strategy(). |
92 | typedef enum { STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z } stem_strategy; |
93 | |
94 | /** Set flags. |
95 | * |
96 | * The new value of flags is: (flags & mask) ^ toggle |
97 | * |
98 | * To just set the flags, pass the new flags in toggle and the |
99 | * default value for mask. |
100 | * |
101 | * @param toggle Flags to XOR. |
102 | * @param mask Flags to AND with first. |
103 | * |
104 | * @return The old flags setting. |
105 | */ |
106 | flags set_flags(flags toggle, flags mask = flags(0)); |
107 | |
108 | /** Set the stemming strategy. |
109 | * |
110 | * This method controls how the stemming algorithm is applied. It was |
111 | * new in Xapian 1.3.1. |
112 | * |
113 | * @param strategy The strategy to use - possible values are: |
114 | * - STEM_NONE: Don't perform any stemming - only unstemmed terms |
115 | * are generated. |
116 | * - STEM_SOME: Generate both stemmed (with a "Z" prefix) and unstemmed |
117 | * terms. This is the default strategy. |
118 | * - STEM_ALL: Generate only stemmed terms (but without a "Z" prefix). |
119 | * - STEM_ALL_Z: Generate only stemmed terms (with a "Z" prefix). |
120 | */ |
121 | void set_stemming_strategy(stem_strategy strategy); |
122 | |
123 | /** Set the maximum length word to index. |
124 | * |
125 | * The limit is on the length of a word prior to stemming and prior to |
126 | * adding any term prefix. |
127 | * |
128 | * The backends mostly impose a limit on the length of terms (often of |
129 | * about 240 bytes), but it's generally useful to have a lower limit to |
130 | * help prevent the index being bloated by useless junk terms from trying |
131 | * to indexing things like binary data, uuencoded data, ASCII art, etc. |
132 | * |
133 | * This method was new in Xapian 1.3.1. |
134 | * |
135 | * @param max_word_length The maximum length word to index, in bytes in |
136 | * UTF-8 representation. Default is 64. |
137 | */ |
138 | void set_max_word_length(unsigned max_word_length); |
139 | |
140 | /** Index some text. |
141 | * |
142 | * @param itor Utf8Iterator pointing to the text to index. |
143 | * @param wdf_inc The wdf increment (default 1). |
144 | * @param prefix The term prefix to use (default is no prefix). |
145 | */ |
146 | void index_text(const Xapian::Utf8Iterator & itor, |
147 | Xapian::termcount wdf_inc = 1, |
148 | const std::string & prefix = std::string()); |
149 | |
150 | /** Index some text in a std::string. |
151 | * |
152 | * @param text The text to index. |
153 | * @param wdf_inc The wdf increment (default 1). |
154 | * @param prefix The term prefix to use (default is no prefix). |
155 | */ |
156 | void index_text(const std::string & text, |
157 | Xapian::termcount wdf_inc = 1, |
158 | const std::string & prefix = std::string()) { |
159 | return index_text(Utf8Iterator(text), wdf_inc, prefix); |
160 | } |
161 | |
162 | /** Index some text without positional information. |
163 | * |
164 | * Just like index_text, but no positional information is generated. This |
165 | * means that the database will be significantly smaller, but that phrase |
166 | * searching and NEAR won't be supported. |
167 | * |
168 | * @param itor Utf8Iterator pointing to the text to index. |
169 | * @param wdf_inc The wdf increment (default 1). |
170 | * @param prefix The term prefix to use (default is no prefix). |
171 | */ |
172 | void index_text_without_positions(const Xapian::Utf8Iterator & itor, |
173 | Xapian::termcount wdf_inc = 1, |
174 | const std::string & prefix = std::string()); |
175 | |
176 | /** Index some text in a std::string without positional information. |
177 | * |
178 | * Just like index_text, but no positional information is generated. This |
179 | * means that the database will be significantly smaller, but that phrase |
180 | * searching and NEAR won't be supported. |
181 | * |
182 | * @param text The text to index. |
183 | * @param wdf_inc The wdf increment (default 1). |
184 | * @param prefix The term prefix to use (default is no prefix). |
185 | */ |
186 | void index_text_without_positions(const std::string & text, |
187 | Xapian::termcount wdf_inc = 1, |
188 | const std::string & prefix = std::string()) { |
189 | return index_text_without_positions(Utf8Iterator(text), wdf_inc, prefix); |
190 | } |
191 | |
192 | /** Increase the term position used by index_text. |
193 | * |
194 | * This can be used between indexing text from different fields or other |
195 | * places to prevent phrase searches from spanning between them (e.g. |
196 | * between the title and body text, or between two chapters in a book). |
197 | * |
198 | * @param delta Amount to increase the term position by (default: 100). |
199 | */ |
200 | void increase_termpos(Xapian::termcount delta = 100); |
201 | |
202 | /// Get the current term position. |
203 | Xapian::termcount get_termpos() const; |
204 | |
205 | /** Set the current term position. |
206 | * |
207 | * @param termpos The new term position to set. |
208 | */ |
209 | void set_termpos(Xapian::termcount termpos); |
210 | |
211 | /// Return a string describing this object. |
212 | std::string get_description() const; |
213 | }; |
214 | |
215 | } |
216 | |
217 | #endif // XAPIAN_INCLUDED_TERMGENERATOR_H |
218 | |