1/*------------------------------------------------------------------------------
2* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3*
4* Distributable under the terms of either the Apache License (Version 2.0) or
5* the GNU Lesser General Public License, as specified in the COPYING file.
6------------------------------------------------------------------------------*/
7#ifndef _lucene_search_Similarity_
8#define _lucene_search_Similarity_
9
10#if defined(_LUCENE_PRAGMA_ONCE)
11# pragma once
12#endif
13
14#include "CLucene/index/Term.h"
15
16CL_NS_DEF(search)
17
// Forward declarations. Searcher is only used through pointers in this
// header, so declaring it here avoids including the searcher header.
// DefaultSimilarity is defined further down in this file.
18class Searcher;
19class DefaultSimilarity;
20
21/** Expert: Scoring API.
22* <p>Subclasses implement search scoring.
23*
24* <p>The score of query <code>q</code> for document <code>d</code> is defined
25* in terms of these methods as follows:
26*
27* <table cellpadding="0" cellspacing="0" border="0">
28* <tr>
29* <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
30* <td valign="middle" align="center">
31* <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
32* <td valign="middle"><small>
33* {@link #tf(int32_t) tf}(t in d) *
34* {@link #idf(Term,Searcher) idf}(t) *
35* {@link Field#getBoost getBoost}(t.field in d) *
36* {@link #lengthNorm(TCHAR*,int32_t) lengthNorm}(t.field in d)
37* </small></td>
38* <td valign="middle" rowspan="2">&nbsp;*
39* {@link #coord(int32_t,int32_t) coord}(q,d) *
40* {@link #queryNorm(qreal) queryNorm}(q)
41* </td>
42* </tr>
43* <tr>
44* <td valign="top" align="right">
45* <small>t in q</small>
46* </td>
47* </tr>
48* </table>
49*
50* @see #setDefault(Similarity)
51* @see IndexWriter#setSimilarity(Similarity)
52* @see Searcher#setSimilarity(Similarity)
53*/
54class Similarity:LUCENE_BASE {
55public:
56 virtual ~Similarity();
57
58 /** Set the default Similarity implementation used by indexing and search
59 * code.
60 *
61 * @see Searcher#setSimilarity(Similarity)
62 * @see IndexWriter#setSimilarity(Similarity)
63 */
64 static void setDefault(Similarity* similarity);
65
66 /** Return the default Similarity implementation used by indexing and search
67 * code.
68 *
69 * <p>This is initially an instance of {@link DefaultSimilarity}.
70 *
71 * @see Searcher#setSimilarity(Similarity)
72 * @see IndexWriter#setSimilarity(Similarity)
73 */
74 static Similarity* getDefault();
75
76 /** Encodes a normalization factor for storage in an index.
77 *
78 * <p>The encoding uses a five-bit exponent and three-bit mantissa, thus
79 * representing values from around 7x10^9 to 2x10^-9 with about one
80 * significant decimal digit of accuracy. Zero is also represented.
81 * Negative numbers are rounded up to zero. Values too large to represent
82 * are rounded down to the largest representable value. Positive values too
83 * small to represent are rounded up to the smallest positive representable
84 * value.
85 *
86 * @see Field#setBoost(qreal)
87 */
88 static uint8_t encodeNorm(qreal f);
89
90 /** Decodes a normalization factor stored in an index.
91 * @see #encodeNorm(qreal)
92 */
93 static qreal decodeNorm(uint8_t b);
94
95 static uint8_t floatToByte(qreal f);
96 static qreal byteToFloat(uint8_t b);
97
98 /** Computes a score factor for a phrase.
99 *
100 * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
101 * for each term in the phrase.
102 *
103 * @param terms the terms in the phrase
104 * @param searcher the document collection being searched
105 * @return a score factor for the phrase
106 */
107 qreal idf(CL_NS(util)::CLVector<CL_NS(index)::Term*>* terms, Searcher* searcher);
108 //qreal idf(Term** terms, Searcher* searcher);
109
110
111 /** Computes a score factor for a simple term.
112 *
113 * <p>The default implementation is:<pre>
114 * return idf(searcher.docFreq(term), searcher.maxDoc());
115 * </pre>
116 *
117 * Note that {@link Searcher#maxDoc()} is used instead of
118 * {@link IndexReader#numDocs()} because it is proportional to
119 * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
120 * so is the other, and in the same direction.
121 *
122 * @param term the term in question
123 * @param searcher the document collection being searched
124 * @return a score factor for the term
125 */
126 qreal idf(CL_NS(index)::Term* term, Searcher* searcher);
127
128
129 /** Computes a score factor based on a term or phrase's frequency in a
130 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
131 * factor for each term in the query and these products are then summed to
132 * form the initial score for a document.
133 *
134 * <p>Terms and phrases repeated in a document indicate the topic of the
135 * document, so implementations of this method usually return larger values
136 * when <code>freq</code> is large, and smaller values when <code>freq</code>
137 * is small.
138 *
139 * <p>The default implementation calls {@link #tf(qreal)}.
140 *
141 * @param freq the frequency of a term within a document
142 * @return a score factor based on a term's within-document frequency
143 */
144 inline qreal tf(int32_t freq){ return tf((qreal)freq); }
145
146 /** Computes the normalization value for a field given the total number of
147 * terms contained in a field. These values, together with field boosts, are
148 * stored in an index and multipled into scores for hits on each field by the
149 * search code.
150 *
151 * <p>Matches in longer fields are less precise, so implemenations of this
152 * method usually return smaller values when <code>numTokens</code> is large,
153 * and larger values when <code>numTokens</code> is small.
154 *
155 * <p>That these values are computed under {@link
156 * IndexWriter#addDocument(Document)} and stored then using
157 * {#encodeNorm(qreal)}. Thus they have limited precision, and documents
158 * must be re-indexed if this method is altered.
159 *
160 * @param fieldName the name of the field
161 * @param numTokens the total number of tokens contained in fields named
162 * <i>fieldName</i> of <i>doc</i>.
163 * @return a normalization factor for hits on this field of this document
164 *
165 * @see Field#setBoost(qreal)
166 */
167 virtual qreal lengthNorm(const TCHAR* fieldName, int32_t numTokens) = 0;
168
169 /** Computes the normalization value for a query given the sum of the squared
170 * weights of each of the query terms. This value is then multipled into the
171 * weight of each query term.
172 *
173 * <p>This does not affect ranking, but rather just attempts to make scores
174 * from different queries comparable.
175 *
176 * @param sumOfSquaredWeights the sum of the squares of query term weights
177 * @return a normalization factor for query weights
178 */
179 virtual qreal queryNorm(qreal sumOfSquaredWeights) = 0;
180
181 /** Computes the amount of a sloppy phrase match, based on an edit distance.
182 * This value is summed for each sloppy phrase match in a document to form
183 * the frequency that is passed to {@link #tf(qreal)}.
184 *
185 * <p>A phrase match with a small edit distance to a document passage more
186 * closely matches the document, so implementations of this method usually
187 * return larger values when the edit distance is small and smaller values
188 * when it is large.
189 *
190 * @see PhraseQuery#setSlop(int32_t)
191 * @param distance the edit distance of this sloppy phrase match
192 * @return the frequency increment for this match
193 */
194 virtual qreal sloppyFreq(int32_t distance) = 0;
195
196 /** Computes a score factor based on a term or phrase's frequency in a
197 * document. This value is multiplied by the {@link #idf(Term, Searcher)}
198 * factor for each term in the query and these products are then summed to
199 * form the initial score for a document.
200 *
201 * <p>Terms and phrases repeated in a document indicate the topic of the
202 * document, so implemenations of this method usually return larger values
203 * when <code>freq</code> is large, and smaller values when <code>freq</code>
204 * is small.
205 *
206 * @param freq the frequency of a term within a document
207 * @return a score factor based on a term's within-document frequency
208 */
209 virtual qreal tf(qreal freq) = 0;
210
211 /** Computes a score factor based on a term's document frequency (the number
212 * of documents which contain the term). This value is multiplied by the
213 * {@link #tf(int32_t)} factor for each term in the query and these products are
214 * then summed to form the initial score for a document.
215 *
216 * <p>Terms that occur in fewer documents are better indicators of topic, so
217 * implemenations of this method usually return larger values for rare terms,
218 * and smaller values for common terms.
219 *
220 * @param docFreq the number of documents which contain the term
221 * @param numDocs the total number of documents in the collection
222 * @return a score factor based on the term's document frequency
223 */
224 virtual qreal idf(int32_t docFreq, int32_t numDocs) = 0;
225
226 /** Computes a score factor based on the fraction of all query terms that a
227 * document contains. This value is multiplied into scores.
228 *
229 * <p>The presence of a large portion of the query terms indicates a better
230 * match with the query, so implemenations of this method usually return
231 * larger values when the ratio between these parameters is large and smaller
232 * values when the ratio between them is small.
233 *
234 * @param overlap the number of query terms matched in the document
235 * @param maxOverlap the total number of terms in the query
236 * @return a score factor based on term overlap with the query
237 */
238 virtual qreal coord(int32_t overlap, int32_t maxOverlap) = 0;
239};
240
241
242/** Expert: Default scoring implementation. */
243class DefaultSimilarity: public Similarity {
244public:
245 DefaultSimilarity();
246 ~DefaultSimilarity();
247
248 /** Implemented as <code>1/sqrt(numTerms)</code>. */
249 qreal lengthNorm(const TCHAR* fieldName, int32_t numTerms);
250
251 /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */
252 qreal queryNorm(qreal sumOfSquaredWeights);
253
254 /** Implemented as <code>sqrt(freq)</code>. */
255 inline qreal tf(qreal freq);
256
257 /** Implemented as <code>1 / (distance + 1)</code>. */
258 qreal sloppyFreq(int32_t distance);
259
260 /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
261 qreal idf(int32_t docFreq, int32_t numDocs);
262
263 /** Implemented as <code>overlap / maxOverlap</code>. */
264 qreal coord(int32_t overlap, int32_t maxOverlap);
265};
266
267CL_NS_END
268#endif
269