1 | /*------------------------------------------------------------------------------ |
2 | * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
3 | * |
4 | * Distributable under the terms of either the Apache License (Version 2.0) or |
5 | * the GNU Lesser General Public License, as specified in the COPYING file. |
6 | ------------------------------------------------------------------------------*/ |
7 | #ifndef _lucene_search_Similarity_ |
8 | #define _lucene_search_Similarity_ |
9 | |
10 | #if defined(_LUCENE_PRAGMA_ONCE) |
11 | # pragma once |
12 | #endif |
13 | |
14 | #include "CLucene/index/Term.h" |
15 | |
16 | CL_NS_DEF(search) |
17 | |
18 | class Searcher;//save including the searchheader.h |
19 | class DefaultSimilarity; |
20 | |
21 | /** Expert: Scoring API. |
22 | * <p>Subclasses implement search scoring. |
23 | * |
24 | * <p>The score of query <code>q</code> for document <code>d</code> is defined |
25 | * in terms of these methods as follows: |
26 | * |
27 | * <table cellpadding="0" cellspacing="0" border="0"> |
28 | * <tr> |
29 | * <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td> |
30 | * <td valign="middle" align="center"> |
31 | * <big><big><big><big><big>Σ</big></big></big></big></big></td> |
32 | * <td valign="middle"><small> |
33 | * {@link #tf(int32_t) tf}(t in d) * |
34 | * {@link #idf(Term,Searcher) idf}(t) * |
35 | * {@link Field#getBoost getBoost}(t.field in d) * |
36 | * {@link #lengthNorm(TCHAR*,int32_t) lengthNorm}(t.field in d) |
37 | * </small></td> |
38 | * <td valign="middle" rowspan="2"> * |
39 | * {@link #coord(int32_t,int32_t) coord}(q,d) * |
40 | * {@link #queryNorm(qreal) queryNorm}(q) |
41 | * </td> |
42 | * </tr> |
43 | * <tr> |
44 | * <td valign="top" align="right"> |
45 | * <small>t in q</small> |
46 | * </td> |
47 | * </tr> |
48 | * </table> |
49 | * |
50 | * @see #setDefault(Similarity) |
51 | * @see IndexWriter#setSimilarity(Similarity) |
52 | * @see Searcher#setSimilarity(Similarity) |
53 | */ |
54 | class Similarity:LUCENE_BASE { |
55 | public: |
56 | virtual ~Similarity(); |
57 | |
58 | /** Set the default Similarity implementation used by indexing and search |
59 | * code. |
60 | * |
61 | * @see Searcher#setSimilarity(Similarity) |
62 | * @see IndexWriter#setSimilarity(Similarity) |
63 | */ |
64 | static void setDefault(Similarity* similarity); |
65 | |
66 | /** Return the default Similarity implementation used by indexing and search |
67 | * code. |
68 | * |
69 | * <p>This is initially an instance of {@link DefaultSimilarity}. |
70 | * |
71 | * @see Searcher#setSimilarity(Similarity) |
72 | * @see IndexWriter#setSimilarity(Similarity) |
73 | */ |
74 | static Similarity* getDefault(); |
75 | |
76 | /** Encodes a normalization factor for storage in an index. |
77 | * |
78 | * <p>The encoding uses a five-bit exponent and three-bit mantissa, thus |
79 | * representing values from around 7x10^9 to 2x10^-9 with about one |
80 | * significant decimal digit of accuracy. Zero is also represented. |
81 | * Negative numbers are rounded up to zero. Values too large to represent |
82 | * are rounded down to the largest representable value. Positive values too |
83 | * small to represent are rounded up to the smallest positive representable |
84 | * value. |
85 | * |
86 | * @see Field#setBoost(qreal) |
87 | */ |
88 | static uint8_t encodeNorm(qreal f); |
89 | |
90 | /** Decodes a normalization factor stored in an index. |
91 | * @see #encodeNorm(qreal) |
92 | */ |
93 | static qreal decodeNorm(uint8_t b); |
94 | |
95 | static uint8_t floatToByte(qreal f); |
96 | static qreal byteToFloat(uint8_t b); |
97 | |
98 | /** Computes a score factor for a phrase. |
99 | * |
100 | * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor |
101 | * for each term in the phrase. |
102 | * |
103 | * @param terms the terms in the phrase |
104 | * @param searcher the document collection being searched |
105 | * @return a score factor for the phrase |
106 | */ |
107 | qreal idf(CL_NS(util)::CLVector<CL_NS(index)::Term*>* terms, Searcher* searcher); |
108 | //qreal idf(Term** terms, Searcher* searcher); |
109 | |
110 | |
111 | /** Computes a score factor for a simple term. |
112 | * |
113 | * <p>The default implementation is:<pre> |
114 | * return idf(searcher.docFreq(term), searcher.maxDoc()); |
115 | * </pre> |
116 | * |
117 | * Note that {@link Searcher#maxDoc()} is used instead of |
118 | * {@link IndexReader#numDocs()} because it is proportional to |
119 | * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate, |
120 | * so is the other, and in the same direction. |
121 | * |
122 | * @param term the term in question |
123 | * @param searcher the document collection being searched |
124 | * @return a score factor for the term |
125 | */ |
126 | qreal idf(CL_NS(index)::Term* term, Searcher* searcher); |
127 | |
128 | |
129 | /** Computes a score factor based on a term or phrase's frequency in a |
130 | * document. This value is multiplied by the {@link #idf(Term, Searcher)} |
131 | * factor for each term in the query and these products are then summed to |
132 | * form the initial score for a document. |
133 | * |
134 | * <p>Terms and phrases repeated in a document indicate the topic of the |
135 | * document, so implementations of this method usually return larger values |
136 | * when <code>freq</code> is large, and smaller values when <code>freq</code> |
137 | * is small. |
138 | * |
139 | * <p>The default implementation calls {@link #tf(qreal)}. |
140 | * |
141 | * @param freq the frequency of a term within a document |
142 | * @return a score factor based on a term's within-document frequency |
143 | */ |
144 | inline qreal tf(int32_t freq){ return tf((qreal)freq); } |
145 | |
146 | /** Computes the normalization value for a field given the total number of |
147 | * terms contained in a field. These values, together with field boosts, are |
148 | * stored in an index and multipled into scores for hits on each field by the |
149 | * search code. |
150 | * |
151 | * <p>Matches in longer fields are less precise, so implemenations of this |
152 | * method usually return smaller values when <code>numTokens</code> is large, |
153 | * and larger values when <code>numTokens</code> is small. |
154 | * |
155 | * <p>That these values are computed under {@link |
156 | * IndexWriter#addDocument(Document)} and stored then using |
157 | * {#encodeNorm(qreal)}. Thus they have limited precision, and documents |
158 | * must be re-indexed if this method is altered. |
159 | * |
160 | * @param fieldName the name of the field |
161 | * @param numTokens the total number of tokens contained in fields named |
162 | * <i>fieldName</i> of <i>doc</i>. |
163 | * @return a normalization factor for hits on this field of this document |
164 | * |
165 | * @see Field#setBoost(qreal) |
166 | */ |
167 | virtual qreal lengthNorm(const TCHAR* fieldName, int32_t numTokens) = 0; |
168 | |
169 | /** Computes the normalization value for a query given the sum of the squared |
170 | * weights of each of the query terms. This value is then multipled into the |
171 | * weight of each query term. |
172 | * |
173 | * <p>This does not affect ranking, but rather just attempts to make scores |
174 | * from different queries comparable. |
175 | * |
176 | * @param sumOfSquaredWeights the sum of the squares of query term weights |
177 | * @return a normalization factor for query weights |
178 | */ |
179 | virtual qreal queryNorm(qreal sumOfSquaredWeights) = 0; |
180 | |
181 | /** Computes the amount of a sloppy phrase match, based on an edit distance. |
182 | * This value is summed for each sloppy phrase match in a document to form |
183 | * the frequency that is passed to {@link #tf(qreal)}. |
184 | * |
185 | * <p>A phrase match with a small edit distance to a document passage more |
186 | * closely matches the document, so implementations of this method usually |
187 | * return larger values when the edit distance is small and smaller values |
188 | * when it is large. |
189 | * |
190 | * @see PhraseQuery#setSlop(int32_t) |
191 | * @param distance the edit distance of this sloppy phrase match |
192 | * @return the frequency increment for this match |
193 | */ |
194 | virtual qreal sloppyFreq(int32_t distance) = 0; |
195 | |
196 | /** Computes a score factor based on a term or phrase's frequency in a |
197 | * document. This value is multiplied by the {@link #idf(Term, Searcher)} |
198 | * factor for each term in the query and these products are then summed to |
199 | * form the initial score for a document. |
200 | * |
201 | * <p>Terms and phrases repeated in a document indicate the topic of the |
202 | * document, so implemenations of this method usually return larger values |
203 | * when <code>freq</code> is large, and smaller values when <code>freq</code> |
204 | * is small. |
205 | * |
206 | * @param freq the frequency of a term within a document |
207 | * @return a score factor based on a term's within-document frequency |
208 | */ |
209 | virtual qreal tf(qreal freq) = 0; |
210 | |
211 | /** Computes a score factor based on a term's document frequency (the number |
212 | * of documents which contain the term). This value is multiplied by the |
213 | * {@link #tf(int32_t)} factor for each term in the query and these products are |
214 | * then summed to form the initial score for a document. |
215 | * |
216 | * <p>Terms that occur in fewer documents are better indicators of topic, so |
217 | * implemenations of this method usually return larger values for rare terms, |
218 | * and smaller values for common terms. |
219 | * |
220 | * @param docFreq the number of documents which contain the term |
221 | * @param numDocs the total number of documents in the collection |
222 | * @return a score factor based on the term's document frequency |
223 | */ |
224 | virtual qreal idf(int32_t docFreq, int32_t numDocs) = 0; |
225 | |
226 | /** Computes a score factor based on the fraction of all query terms that a |
227 | * document contains. This value is multiplied into scores. |
228 | * |
229 | * <p>The presence of a large portion of the query terms indicates a better |
230 | * match with the query, so implemenations of this method usually return |
231 | * larger values when the ratio between these parameters is large and smaller |
232 | * values when the ratio between them is small. |
233 | * |
234 | * @param overlap the number of query terms matched in the document |
235 | * @param maxOverlap the total number of terms in the query |
236 | * @return a score factor based on term overlap with the query |
237 | */ |
238 | virtual qreal coord(int32_t overlap, int32_t maxOverlap) = 0; |
239 | }; |
240 | |
241 | |
242 | /** Expert: Default scoring implementation. */ |
243 | class DefaultSimilarity: public Similarity { |
244 | public: |
245 | DefaultSimilarity(); |
246 | ~DefaultSimilarity(); |
247 | |
248 | /** Implemented as <code>1/sqrt(numTerms)</code>. */ |
249 | qreal lengthNorm(const TCHAR* fieldName, int32_t numTerms); |
250 | |
251 | /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */ |
252 | qreal queryNorm(qreal sumOfSquaredWeights); |
253 | |
254 | /** Implemented as <code>sqrt(freq)</code>. */ |
255 | inline qreal tf(qreal freq); |
256 | |
257 | /** Implemented as <code>1 / (distance + 1)</code>. */ |
258 | qreal sloppyFreq(int32_t distance); |
259 | |
260 | /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ |
261 | qreal idf(int32_t docFreq, int32_t numDocs); |
262 | |
263 | /** Implemented as <code>overlap / maxOverlap</code>. */ |
264 | qreal coord(int32_t overlap, int32_t maxOverlap); |
265 | }; |
266 | |
267 | CL_NS_END |
268 | #endif |
269 | |