/** @file matchspy.h
 * @brief MatchSpy implementation.
 */
/* Copyright (C) 2007,2008,2009,2010,2012 Olly Betts
 * Copyright (C) 2007,2009 Lemur Consulting Ltd
 * Copyright (C) 2010 Richard Boulton
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
22
23#ifndef XAPIAN_INCLUDED_MATCHSPY_H
24#define XAPIAN_INCLUDED_MATCHSPY_H
25
26#include <xapian/base.h>
27#include <xapian/enquire.h>
28#include <xapian/termiterator.h>
29#include <xapian/visibility.h>
30
31#include <string>
32#include <map>
33#include <set>
34#include <string>
35#include <vector>
36
37namespace Xapian {
38
// Forward declarations: this header only refers to these types by reference
// in method signatures, so their full definitions are not needed here.
class Document;
class Registry;
41
42/** Abstract base class for match spies.
43 *
44 * The subclasses will generally accumulate information seen during the match,
45 * to calculate aggregate functions, or other profiles of the matching
46 * documents.
47 */
48class XAPIAN_VISIBILITY_DEFAULT MatchSpy {
49 private:
50 /// Don't allow assignment.
51 void operator=(const MatchSpy &);
52
53 /// Don't allow copying.
54 MatchSpy(const MatchSpy &);
55
56 protected:
57 /// Default constructor, needed by subclass constructors.
58 MatchSpy() {}
59
60 public:
61 /** Virtual destructor, because we have virtual methods. */
62 virtual ~MatchSpy();
63
64 /** Register a document with the match spy.
65 *
66 * This is called by the matcher once with each document seen by the
67 * matcher during the match process. Note that the matcher will often not
68 * see all the documents which match the query, due to optimisations which
69 * allow low-weighted documents to be skipped, and allow the match process
70 * to be terminated early.
71 *
72 * @param doc The document seen by the match spy.
73 * @param wt The weight of the document.
74 */
75 virtual void operator()(const Xapian::Document &doc,
76 Xapian::weight wt) = 0;
77
78 /** Clone the match spy.
79 *
80 * The clone should inherit the configuration of the parent, but need not
81 * inherit the state. ie, the clone does not need to be passed
82 * information about the results seen by the parent.
83 *
84 * If you don't want to support the remote backend in your match spy, you
85 * can use the default implementation which simply throws
86 * Xapian::UnimplementedError.
87 *
88 * Note that the returned object will be deallocated by Xapian after use
89 * with "delete". If you want to handle the deletion in a special way
90 * (for example when wrapping the Xapian API for use from another
91 * language) then you can define a static <code>operator delete</code>
92 * method in your subclass as shown here:
93 * http://trac.xapian.org/ticket/554#comment:1
94 */
95 virtual MatchSpy * clone() const;
96
97 /** Return the name of this match spy.
98 *
99 * This name is used by the remote backend. It is passed with the
100 * serialised parameters to the remote server so that it knows which class
101 * to create.
102 *
103 * Return the full namespace-qualified name of your class here - if your
104 * class is called MyApp::FooMatchSpy, return "MyApp::FooMatchSpy" from
105 * this method.
106 *
107 * If you don't want to support the remote backend in your match spy, you
108 * can use the default implementation which simply throws
109 * Xapian::UnimplementedError.
110 */
111 virtual std::string name() const;
112
113 /** Return this object's parameters serialised as a single string.
114 *
115 * If you don't want to support the remote backend in your match spy, you
116 * can use the default implementation which simply throws
117 * Xapian::UnimplementedError.
118 */
119 virtual std::string serialise() const;
120
121 /** Unserialise parameters.
122 *
123 * This method unserialises parameters serialised by the @a serialise()
124 * method and allocates and returns a new object initialised with them.
125 *
126 * If you don't want to support the remote backend in your match spy, you
127 * can use the default implementation which simply throws
128 * Xapian::UnimplementedError.
129 *
130 * Note that the returned object will be deallocated by Xapian after use
131 * with "delete". If you want to handle the deletion in a special way
132 * (for example when wrapping the Xapian API for use from another
133 * language) then you can define a static <code>operator delete</code>
134 * method in your subclass as shown here:
135 * http://trac.xapian.org/ticket/554#comment:1
136 *
137 * @param s A string containing the serialised results.
138 * @param context Registry object to use for unserialisation to permit
139 * MatchSpy subclasses with sub-MatchSpy objects to be
140 * implemented.
141 */
142 virtual MatchSpy * unserialise(const std::string & s,
143 const Registry & context) const;
144
145 /** Serialise the results of this match spy.
146 *
147 * If you don't want to support the remote backend in your match spy, you
148 * can use the default implementation which simply throws
149 * Xapian::UnimplementedError.
150 */
151 virtual std::string serialise_results() const;
152
153 /** Unserialise some results, and merge them into this matchspy.
154 *
155 * The order in which results are merged should not be significant, since
156 * this order is not specified (and will vary depending on the speed of
157 * the search in each sub-database).
158 *
159 * If you don't want to support the remote backend in your match spy, you
160 * can use the default implementation which simply throws
161 * Xapian::UnimplementedError.
162 *
163 * @param s A string containing the serialised results.
164 */
165 virtual void merge_results(const std::string & s);
166
167 /** Return a string describing this object.
168 *
169 * This default implementation returns a generic answer, to avoid forcing
170 * those deriving their own MatchSpy subclasses from having to implement
171 * this (they may not care what get_description() gives for their
172 * subclass).
173 */
174 virtual std::string get_description() const;
175};
176
177
178/** Class for counting the frequencies of values in the matching documents.
179 */
180class XAPIAN_VISIBILITY_DEFAULT ValueCountMatchSpy : public MatchSpy {
181 public:
182 struct Internal;
183
184#ifndef SWIG // SWIG doesn't need to know about the internal class
185 struct XAPIAN_VISIBILITY_DEFAULT Internal
186 : public Xapian::Internal::RefCntBase
187 {
188 /// The slot to count.
189 Xapian::valueno slot;
190
191 /// Total number of documents seen by the match spy.
192 Xapian::doccount total;
193
194 /// The values seen so far, together with their frequency.
195 std::map<std::string, Xapian::doccount> values;
196
197 Internal() : slot(Xapian::BAD_VALUENO), total(0) {}
198 Internal(Xapian::valueno slot_) : slot(slot_), total(0) {}
199 };
200#endif
201
202 protected:
203 Xapian::Internal::RefCntPtr<Internal> internal;
204
205 public:
206 /// Construct an empty ValueCountMatchSpy.
207 ValueCountMatchSpy() : internal() {}
208
209 /// Construct a MatchSpy which counts the values in a particular slot.
210 ValueCountMatchSpy(Xapian::valueno slot_)
211 : internal(new Internal(slot_)) {}
212
213 /** Return the total number of documents tallied. */
214 size_t get_total() const {
215 return internal.get() ? internal->total : 0;
216 }
217
218 /** Get an iterator over the values seen in the slot.
219 *
220 * Items will be returned in ascending alphabetical order.
221 *
222 * During the iteration, the frequency of the current value can be
223 * obtained with the get_termfreq() method on the iterator.
224 */
225 TermIterator values_begin() const;
226
227 /** End iterator corresponding to values_begin() */
228 TermIterator values_end() const {
229 return TermIterator();
230 }
231
232 /** Get an iterator over the most frequent values seen in the slot.
233 *
234 * Items will be returned in descending order of frequency. Values with
235 * the same frequency will be returned in ascending alphabetical order.
236 *
237 * During the iteration, the frequency of the current value can be
238 * obtained with the get_termfreq() method on the iterator.
239 *
240 * @param maxvalues The maximum number of values to return.
241 */
242 TermIterator top_values_begin(size_t maxvalues) const;
243
244 /** End iterator corresponding to top_values_begin() */
245 TermIterator top_values_end(size_t) const {
246 return TermIterator();
247 }
248
249 /** Implementation of virtual operator().
250 *
251 * This implementation tallies values for a matching document.
252 *
253 * @param doc The document to tally values for.
254 * @param wt The weight of the document (ignored by this class).
255 */
256 void operator()(const Xapian::Document &doc, Xapian::weight wt);
257
258 virtual MatchSpy * clone() const;
259 virtual std::string name() const;
260 virtual std::string serialise() const;
261 virtual MatchSpy * unserialise(const std::string & s,
262 const Registry & context) const;
263 virtual std::string serialise_results() const;
264 virtual void merge_results(const std::string & s);
265 virtual std::string get_description() const;
266};
267
268}
269
270#endif // XAPIAN_INCLUDED_MATCHSPY_H
271