1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the Qt Linguist of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:GPL-EXCEPT$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU
19** General Public License version 3 as published by the Free Software
20** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
21** included in the packaging of this file. Please review the following
22** information to ensure the GNU General Public License requirements will
23** be met: https://www.gnu.org/licenses/gpl-3.0.html.
24**
25** $QT_END_LICENSE$
26**
27****************************************************************************/
28
29#include "simtexth.h"
30#include "translator.h"
31
32#include <QtCore/QByteArray>
33#include <QtCore/QString>
34#include <QtCore/QList>
35
36
37QT_BEGIN_NAMESPACE
38
39typedef QList<TranslatorMessage> TML;
40
41/*
42 How similar are two texts? The approach used here relies on co-occurrence
43 matrices and is very efficient.
44
45 Let's see with an example: how similar are "here" and "hither"? The
46 co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
47 elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
48 N[h,e] = 1, N[e,r] = 1, and 0 elsewhere. The union U of both matrices is the
49 matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
50 V[i,j] = min { M[i,j], N[i,j] }. The score for a pair of texts is
51
52 score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
53
54 a formula suggested by Arnt Gulbrandsen. Here we have
55
56 score = 2 / 6,
57
58 or one third.
59
60 The implementation differs from this in a few details. Most importantly,
61 repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
62*/
63
64/*
65 Every character is assigned to one of 20 buckets so that the co-occurrence
66 matrix requires only 20 * 20 = 400 bits, not 256 * 256 = 65536 bits or even
67 more if we want the whole Unicode. Which character falls in which bucket is
68 arbitrary.
69
70 The second half of the table is a replica of the first half, because of
71 laziness.
72*/
73static const int indexOf[256] = {
74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76// ! " # $ % & ' ( ) * + , - . /
77 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
78// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
79 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
80// @ A B C D E F G H I J K L M N O
81 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
82// P Q R S T U V W X Y Z [ \ ] ^ _
83 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
84// ` a b c d e f g h i j k l m n o
85 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
86// p q r s t u v w x y z { | } ~
87 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
88
89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
90 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
91 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
92 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
93 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
94 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
95 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
96 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0
97};
98
99/*
100 The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
101 represent i in binary.
102*/
103static const int bitCount[256] = {
104 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
105 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
106 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
107 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
108 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
109 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
110 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
111 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
112 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
113 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
114 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
115 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
116 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
117 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
118 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
119 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
120};
121
122static inline void setCoOccurence(CoMatrix &m, char c, char d)
123{
124 int k = indexOf[(uchar) c] + 20 * indexOf[(uchar) d];
125 m.b[k >> 3] |= (1 << (k & 0x7));
126}
127
128CoMatrix::CoMatrix(const QString &str)
129{
130 QByteArray ba = str.toUtf8();
131 const char *text = ba.constData();
132 char c = '\0', d;
133 memset( s: b, c: 0, n: 52 );
134 /*
135 The Knuth books are not in the office only for show; they help make
136 loops 30% faster and 20% as readable.
137 */
138 while ( (d = *text) != '\0' ) {
139 setCoOccurence(m&: *this, c, d);
140 if ( (c = *++text) != '\0' ) {
141 setCoOccurence(m&: *this, c: d, d: c);
142 text++;
143 }
144 }
145}
146
147static inline int worth(const CoMatrix &m)
148{
149 int w = 0;
150 for (int i = 0; i < 50; i++)
151 w += bitCount[m.b[i]];
152 return w;
153}
154
155static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
156{
157 CoMatrix p;
158 for (int i = 0; i < 13; ++i)
159 p.w[i] = m.w[i] | n.w[i];
160 return p;
161}
162
163static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
164{
165 CoMatrix p;
166 for (int i = 0; i < 13; ++i)
167 p.w[i] = m.w[i] & n.w[i];
168 return p;
169}
170
171StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
172 : m_cm(stringToMatch)
173{
174 m_length = stringToMatch.length();
175}
176
177int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
178{
179 CoMatrix cmTarget(strCandidate);
180 int delta = qAbs(t: m_length - strCandidate.size());
181 int score = ( (worth(m: intersection(m: m_cm, n: cmTarget)) + 1) << 10 ) /
182 ( worth(m: reunion(m: m_cm, n: cmTarget)) + (delta << 1) + 1 );
183 return score;
184}
185
186CandidateList similarTextHeuristicCandidates(const Translator *tor,
187 const QString &text, int maxCandidates)
188{
189 QList<int> scores;
190 CandidateList candidates;
191 StringSimilarityMatcher matcher(text);
192
193 foreach (const TranslatorMessage &mtm, tor->messages()) {
194 if (mtm.type() == TranslatorMessage::Unfinished
195 || mtm.translation().isEmpty())
196 continue;
197
198 QString s = mtm.sourceText();
199 int score = matcher.getSimilarityScore(strCandidate: s);
200
201 if (candidates.size() == maxCandidates && score > scores[maxCandidates - 1] )
202 candidates.removeLast();
203
204 if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
205 Candidate cand(mtm.context(), s, mtm.comment(), mtm.translation());
206
207 int i;
208 for (i = 0; i < candidates.size(); i++) {
209 if (score >= scores.at(i)) {
210 if (score == scores.at(i)) {
211 if (candidates.at(i) == cand)
212 goto continue_outer_loop;
213 } else {
214 break;
215 }
216 }
217 }
218 scores.insert(i, t: score);
219 candidates.insert(i, t: cand);
220 }
221 continue_outer_loop:
222 ;
223 }
224 return candidates;
225}
226
227QT_END_NAMESPACE
228

source code of qttools/src/linguist/shared/simtexth.cpp