1/** @file termgenerator.h
2 * @brief parse free text and generate terms
3 */
4/* Copyright (C) 2007,2009,2011,2012 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#ifndef XAPIAN_INCLUDED_TERMGENERATOR_H
22#define XAPIAN_INCLUDED_TERMGENERATOR_H
23
24#include <xapian/base.h>
25#include <xapian/types.h>
26#include <xapian/unicode.h>
27#include <xapian/visibility.h>
28
29#include <string>
30
31namespace Xapian {
32
33class Document;
34class Stem;
35class Stopper;
36class WritableDatabase;
37
38/** Parses a piece of text and generate terms.
39 *
40 * This module takes a piece of text and parses it to produce words which are
41 * then used to generate suitable terms for indexing. The terms generated are
42 * suitable for use with Query objects produced by the QueryParser class.
43 */
44class XAPIAN_VISIBILITY_DEFAULT TermGenerator {
45 public:
46 /// @private @internal Class representing the TermGenerator internals.
47 class Internal;
48 /// @private @internal Reference counted internals.
49 Xapian::Internal::RefCntPtr<Internal> internal;
50
51 /// Copy constructor.
52 TermGenerator(const TermGenerator & o);
53
54 /// Assignment.
55 TermGenerator & operator=(const TermGenerator & o);
56
57 /// Default constructor.
58 TermGenerator();
59
60 /// Destructor.
61 ~TermGenerator();
62
63 /// Set the Xapian::Stem object to be used for generating stemmed terms.
64 void set_stemmer(const Xapian::Stem & stemmer);
65
66 /** Set the Xapian::Stopper object to be used for identifying stopwords.
67 *
68 * Stemmed forms of stopwords aren't indexed, but unstemmed forms still
69 * are so that searches for phrases including stop words still work.
70 *
71 * @param stop The Stopper object to set (default NULL, which means no
72 * stopwords).
73 */
74 void set_stopper(const Xapian::Stopper *stop = NULL);
75
76 /// Set the current document.
77 void set_document(const Xapian::Document & doc);
78
79 /// Get the current document.
80 const Xapian::Document & get_document() const;
81
82 /// Set the database to index spelling data to.
83 void set_database(const Xapian::WritableDatabase &db);
84
85 /// Flags to OR together and pass to TermGenerator::set_flags().
86 enum flags {
87 /// Index data required for spelling correction.
88 FLAG_SPELLING = 128 // Value matches QueryParser flag.
89 };
90
91 /// Stemming strategies, for use with set_stemming_strategy().
92 typedef enum { STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z } stem_strategy;
93
94 /** Set flags.
95 *
96 * The new value of flags is: (flags & mask) ^ toggle
97 *
98 * To just set the flags, pass the new flags in toggle and the
99 * default value for mask.
100 *
101 * @param toggle Flags to XOR.
102 * @param mask Flags to AND with first.
103 *
104 * @return The old flags setting.
105 */
106 flags set_flags(flags toggle, flags mask = flags(0));
107
108 /** Set the stemming strategy.
109 *
110 * This method controls how the stemming algorithm is applied. It was
111 * new in Xapian 1.3.1.
112 *
113 * @param strategy The strategy to use - possible values are:
114 * - STEM_NONE: Don't perform any stemming - only unstemmed terms
115 * are generated.
116 * - STEM_SOME: Generate both stemmed (with a "Z" prefix) and unstemmed
117 * terms. This is the default strategy.
118 * - STEM_ALL: Generate only stemmed terms (but without a "Z" prefix).
119 * - STEM_ALL_Z: Generate only stemmed terms (with a "Z" prefix).
120 */
121 void set_stemming_strategy(stem_strategy strategy);
122
123 /** Set the maximum length word to index.
124 *
125 * The limit is on the length of a word prior to stemming and prior to
126 * adding any term prefix.
127 *
128 * The backends mostly impose a limit on the length of terms (often of
129 * about 240 bytes), but it's generally useful to have a lower limit to
130 * help prevent the index being bloated by useless junk terms from trying
131 * to indexing things like binary data, uuencoded data, ASCII art, etc.
132 *
133 * This method was new in Xapian 1.3.1.
134 *
135 * @param max_word_length The maximum length word to index, in bytes in
136 * UTF-8 representation. Default is 64.
137 */
138 void set_max_word_length(unsigned max_word_length);
139
140 /** Index some text.
141 *
142 * @param itor Utf8Iterator pointing to the text to index.
143 * @param wdf_inc The wdf increment (default 1).
144 * @param prefix The term prefix to use (default is no prefix).
145 */
146 void index_text(const Xapian::Utf8Iterator & itor,
147 Xapian::termcount wdf_inc = 1,
148 const std::string & prefix = std::string());
149
150 /** Index some text in a std::string.
151 *
152 * @param text The text to index.
153 * @param wdf_inc The wdf increment (default 1).
154 * @param prefix The term prefix to use (default is no prefix).
155 */
156 void index_text(const std::string & text,
157 Xapian::termcount wdf_inc = 1,
158 const std::string & prefix = std::string()) {
159 return index_text(Utf8Iterator(text), wdf_inc, prefix);
160 }
161
162 /** Index some text without positional information.
163 *
164 * Just like index_text, but no positional information is generated. This
165 * means that the database will be significantly smaller, but that phrase
166 * searching and NEAR won't be supported.
167 *
168 * @param itor Utf8Iterator pointing to the text to index.
169 * @param wdf_inc The wdf increment (default 1).
170 * @param prefix The term prefix to use (default is no prefix).
171 */
172 void index_text_without_positions(const Xapian::Utf8Iterator & itor,
173 Xapian::termcount wdf_inc = 1,
174 const std::string & prefix = std::string());
175
176 /** Index some text in a std::string without positional information.
177 *
178 * Just like index_text, but no positional information is generated. This
179 * means that the database will be significantly smaller, but that phrase
180 * searching and NEAR won't be supported.
181 *
182 * @param text The text to index.
183 * @param wdf_inc The wdf increment (default 1).
184 * @param prefix The term prefix to use (default is no prefix).
185 */
186 void index_text_without_positions(const std::string & text,
187 Xapian::termcount wdf_inc = 1,
188 const std::string & prefix = std::string()) {
189 return index_text_without_positions(Utf8Iterator(text), wdf_inc, prefix);
190 }
191
192 /** Increase the term position used by index_text.
193 *
194 * This can be used between indexing text from different fields or other
195 * places to prevent phrase searches from spanning between them (e.g.
196 * between the title and body text, or between two chapters in a book).
197 *
198 * @param delta Amount to increase the term position by (default: 100).
199 */
200 void increase_termpos(Xapian::termcount delta = 100);
201
202 /// Get the current term position.
203 Xapian::termcount get_termpos() const;
204
205 /** Set the current term position.
206 *
207 * @param termpos The new term position to set.
208 */
209 void set_termpos(Xapian::termcount termpos);
210
211 /// Return a string describing this object.
212 std::string get_description() const;
213};
214
215}
216
217#endif // XAPIAN_INCLUDED_TERMGENERATOR_H
218