1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the Qt Linguist of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:GPL-EXCEPT$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU
19** General Public License version 3 as published by the Free Software
20** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
21** included in the packaging of this file. Please review the following
22** information to ensure the GNU General Public License requirements will
23** be met: https://www.gnu.org/licenses/gpl-3.0.html.
24**
25** $QT_END_LICENSE$
26**
27****************************************************************************/
28
29#include "lupdate.h"
30
31#include "simtexth.h"
32#include "translator.h"
33
34#include <QtCore/QCoreApplication>
35#include <QtCore/QDebug>
36#include <QtCore/QMap>
37#include <QtCore/QStringList>
38#include <QtCore/QVector>
39
40QT_BEGIN_NAMESPACE
41
42static bool isDigitFriendly(QChar c)
43{
44 return c.isPunct() || c.isSpace();
45}
46
47static int numberLength(const QString &s, int i)
48{
49 if (i >= s.size() || !s.at(i).isDigit())
50 return 0;
51
52 int pos = i;
53 do {
54 ++i;
55 } while (i < s.size()
56 && (s.at(i).isDigit()
57 || (isDigitFriendly(c: s[i])
58 && i + 1 < s.size()
59 && (s[i + 1].isDigit()
60 || (isDigitFriendly(c: s[i + 1])
61 && i + 2 < s.size()
62 && s[i + 2].isDigit())))));
63 return i - pos;
64}
65
66
67/*
68 Returns a version of 'key' where all numbers have been replaced by zeroes. If
69 there were none, returns "".
70*/
71static QString zeroKey(const QString &key)
72{
73 QString zeroed;
74 bool metSomething = false;
75
76 for (int i = 0; i < key.size(); ++i) {
77 int len = numberLength(s: key, i);
78 if (len > 0) {
79 i += len;
80 zeroed.append(c: QLatin1Char('0'));
81 metSomething = true;
82 } else {
83 zeroed.append(c: key.at(i));
84 }
85 }
86 return metSomething ? zeroed : QString();
87}
88
89static QString translationAttempt(const QString &oldTranslation,
90 const QString &oldSource, const QString &newSource)
91{
92 int p = zeroKey(key: oldSource).count(c: QLatin1Char('0'));
93 QString attempt;
94 QStringList oldNumbers;
95 QStringList newNumbers;
96 QVector<bool> met(p);
97 QVector<int> matchedYet(p);
98 int i, j;
99 int k = 0, ell, best;
100 int m, n;
101 int pass;
102
103 /*
104 This algorithm is hard to follow, so we'll consider an example
105 all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0"
106 and newSource is "XeT 3.1".
107
108 First, we set up two tables: oldNumbers and newNumbers. In our
109 example, oldNumber[0] is "3.0" and newNumber[0] is "3.1".
110 */
111 for (i = 0, j = 0; i < oldSource.size(); i++, j++) {
112 m = numberLength(s: oldSource, i);
113 n = numberLength(s: newSource, i: j);
114 if (m > 0) {
115 oldNumbers.append(t: oldSource.mid(position: i, n: m + 1));
116 newNumbers.append(t: newSource.mid(position: j, n: n + 1));
117 i += m;
118 j += n;
119 met[k] = false;
120 matchedYet[k] = 0;
121 k++;
122 }
123 }
124
125 /*
126 We now go over the old translation, "XeT 3.0", one letter at a
127 time, looking for numbers found in oldNumbers. Whenever such a
128 number is met, it is replaced with its newNumber equivalent. In
129 our example, the "3.0" of "XeT 3.0" becomes "3.1".
130 */
131 for (i = 0; i < oldTranslation.length(); i++) {
132 attempt += oldTranslation[i];
133 for (k = 0; k < p; k++) {
134 if (oldTranslation[i] == oldNumbers[k][matchedYet[k]])
135 matchedYet[k]++;
136 else
137 matchedYet[k] = 0;
138 }
139
140 /*
141 Let's find out if the last character ended a match. We make
142 two passes over the data. In the first pass, we try to
143 match only numbers that weren't matched yet; if that fails,
144 the second pass does the trick. This is useful in some
145 suspicious cases, flagged below.
146 */
147 for (pass = 0; pass < 2; pass++) {
148 best = p; // an impossible value
149 for (k = 0; k < p; k++) {
150 if ((!met[k] || pass > 0) &&
151 matchedYet[k] == oldNumbers[k].length() &&
152 numberLength(s: oldTranslation, i: i + 1 - matchedYet[k]) == matchedYet[k]) {
153 // the longer the better
154 if (best == p || matchedYet[k] > matchedYet[best])
155 best = k;
156 }
157 }
158 if (best != p) {
159 attempt.truncate(pos: attempt.length() - matchedYet[best]);
160 attempt += newNumbers[best];
161 met[best] = true;
162 for (k = 0; k < p; k++)
163 matchedYet[k] = 0;
164 break;
165 }
166 }
167 }
168
169 /*
170 We flag two kinds of suspicious cases. They are identified as
171 such with comments such as "{2000?}" at the end.
172
173 Example of the first kind: old source text "TeX 3.0" translated
174 as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the
175 new text is.
176 */
177 for (k = 0; k < p; k++) {
178 if (!met[k])
179 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String("?}");
180 }
181
182 /*
183 Example of the second kind: "1 of 1" translated as "1 af 1",
184 with new source text "1 of 2", generates "1 af 2 {1 or 2?}"
185 because it's not clear which of "1 af 2" and "2 af 1" is right.
186 */
187 for (k = 0; k < p; k++) {
188 for (ell = 0; ell < p; ell++) {
189 if (k != ell && oldNumbers[k] == oldNumbers[ell] &&
190 newNumbers[k] < newNumbers[ell])
191 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String(" or ") +
192 newNumbers[ell] + QLatin1String("?}");
193 }
194 }
195 return attempt;
196}
197
198
199/*
200 Augments a Translator with translations easily derived from
201 similar existing (probably obsolete) translations.
202
203 For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1"
204 has no translation, "XeT 3.1" is added to the translator and is
205 marked Unfinished.
206
207 Returns the number of additional messages that this heuristic translated.
208*/
209int applyNumberHeuristic(Translator &tor)
210{
211 QMap<QString, QPair<QString, QString> > translated;
212 QVector<bool> untranslated(tor.messageCount());
213 int inserted = 0;
214
215 for (int i = 0; i < tor.messageCount(); ++i) {
216 const TranslatorMessage &msg = tor.message(i);
217 bool hasTranslation = msg.isTranslated();
218 if (msg.type() == TranslatorMessage::Unfinished) {
219 if (!hasTranslation)
220 untranslated[i] = true;
221 } else if (hasTranslation && msg.translations().count() == 1) {
222 const QString &key = zeroKey(key: msg.sourceText());
223 if (!key.isEmpty())
224 translated.insert(akey: key, avalue: qMakePair(x: msg.sourceText(), y: msg.translation()));
225 }
226 }
227
228 for (int i = 0; i < tor.messageCount(); ++i) {
229 if (untranslated[i]) {
230 TranslatorMessage &msg = tor.message(i);
231 const QString &key = zeroKey(key: msg.sourceText());
232 if (!key.isEmpty()) {
233 QMap<QString, QPair<QString, QString> >::ConstIterator t =
234 translated.constFind(akey: key);
235 if (t != translated.constEnd() && t->first != msg.sourceText()) {
236 msg.setTranslation(translationAttempt(oldTranslation: t->second, oldSource: t->first,
237 newSource: msg.sourceText()));
238 inserted++;
239 }
240 }
241 }
242 }
243 return inserted;
244}
245
246
247/*
248 Augments a Translator with trivially derived translations.
249
250 For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no
251 matter the context or the comment, "Eingeschaltet:" is added as the
252 translation of any untranslated "Enabled:" text and is marked Unfinished.
253
254 Returns the number of additional messages that this heuristic translated.
255*/
256
257int applySameTextHeuristic(Translator &tor)
258{
259 QMap<QString, QStringList> translated;
260 QMap<QString, bool> avoid; // Want a QTreeSet, in fact
261 QVector<bool> untranslated(tor.messageCount());
262 int inserted = 0;
263
264 for (int i = 0; i < tor.messageCount(); ++i) {
265 const TranslatorMessage &msg = tor.message(i);
266 if (!msg.isTranslated()) {
267 if (msg.type() == TranslatorMessage::Unfinished)
268 untranslated[i] = true;
269 } else {
270 const QString &key = msg.sourceText();
271 QMap<QString, QStringList>::ConstIterator t = translated.constFind(akey: key);
272 if (t != translated.constEnd()) {
273 /*
274 The same source text is translated at least two
275 different ways. Do nothing then.
276 */
277 if (*t != msg.translations()) {
278 translated.remove(akey: key);
279 avoid.insert(akey: key, avalue: true);
280 }
281 } else if (!avoid.contains(akey: key)) {
282 translated.insert(akey: key, avalue: msg.translations());
283 }
284 }
285 }
286
287 for (int i = 0; i < tor.messageCount(); ++i) {
288 if (untranslated[i]) {
289 TranslatorMessage &msg = tor.message(i);
290 QMap<QString, QStringList>::ConstIterator t = translated.constFind(akey: msg.sourceText());
291 if (t != translated.constEnd()) {
292 msg.setTranslations(*t);
293 ++inserted;
294 }
295 }
296 }
297 return inserted;
298}
299
300
301
302/*
303 Merges two Translator objects. The first one
304 is a set of source texts and translations for a previous version of
305 the internationalized program; the second one is a set of fresh
306 source texts newly extracted from the source code, without any
307 translation yet.
308*/
309
310Translator merge(
311 const Translator &tor, const Translator &virginTor, const QList<Translator> &aliens,
312 UpdateOptions options, QString &err)
313{
314 int known = 0;
315 int neww = 0;
316 int obsoleted = 0;
317 int similarTextHeuristicCount = 0;
318
319 Translator outTor;
320 outTor.setLanguageCode(tor.languageCode());
321 outTor.setSourceLanguageCode(tor.sourceLanguageCode());
322 outTor.setLocationsType(tor.locationsType());
323
324 /*
325 The types of all the messages from the vernacular translator
326 are updated according to the virgin translator.
327 */
328 foreach (TranslatorMessage m, tor.messages()) {
329 TranslatorMessage::Type newType = TranslatorMessage::Finished;
330
331 if (m.sourceText().isEmpty() && m.id().isEmpty()) {
332 // context/file comment
333 int mvi = virginTor.find(context: m.context());
334 if (mvi >= 0)
335 m.setComment(virginTor.constMessage(i: mvi).comment());
336 } else {
337 TranslatorMessage::ExtraData extras;
338 const TranslatorMessage *mv;
339 int mvi = virginTor.find(msg: m);
340 if (mvi < 0) {
341 if (!(options & HeuristicSimilarText)) {
342 makeObsolete:
343 switch (m.type()) {
344 case TranslatorMessage::Finished:
345 newType = TranslatorMessage::Vanished;
346 obsoleted++;
347 break;
348 case TranslatorMessage::Unfinished:
349 newType = TranslatorMessage::Obsolete;
350 obsoleted++;
351 break;
352 default:
353 newType = m.type();
354 break;
355 }
356 m.clearReferences();
357 } else {
358 mvi = virginTor.find(context: m.context(), comment: m.comment(), refs: m.allReferences());
359 if (mvi < 0) {
360 // did not find it in the virgin, mark it as obsolete
361 goto makeObsolete;
362 }
363 mv = &virginTor.constMessage(i: mvi);
364 // Do not just accept it if its on the same line number,
365 // but different source text.
366 // Also check if the texts are more or less similar before
367 // we consider them to represent the same message...
368 if (getSimilarityScore(str1: m.sourceText(), str2: mv->sourceText()) < textSimilarityThreshold) {
369 // The virgin and vernacular sourceTexts are so different that we could not find it
370 goto makeObsolete;
371 }
372 // It is just slightly modified, assume that it is the same string
373
374 extras = mv->extras();
375
376 // Mark it as unfinished. (Since the source text
377 // was changed it might require re-translating...)
378 newType = TranslatorMessage::Unfinished;
379 ++similarTextHeuristicCount;
380 neww++;
381 goto outdateSource;
382 }
383 } else {
384 mv = &virginTor.message(i: mvi);
385 extras = mv->extras();
386 if (!mv->id().isEmpty()
387 && (mv->context() != m.context()
388 || mv->sourceText() != m.sourceText()
389 || mv->comment() != m.comment())) {
390 known++;
391 newType = TranslatorMessage::Unfinished;
392 m.setContext(mv->context());
393 m.setComment(mv->comment());
394 if (mv->sourceText() != m.sourceText()) {
395 outdateSource:
396 m.setOldSourceText(m.sourceText());
397 m.setSourceText(mv->sourceText());
398 const QString &oldpluralsource = m.extra(ba: QLatin1String("po-msgid_plural"));
399 if (!oldpluralsource.isEmpty())
400 extras.insert(akey: QLatin1String("po-old_msgid_plural"), avalue: oldpluralsource);
401 }
402 } else {
403 switch (m.type()) {
404 case TranslatorMessage::Finished:
405 default:
406 if (m.isPlural() == mv->isPlural()) {
407 newType = TranslatorMessage::Finished;
408 } else {
409 newType = TranslatorMessage::Unfinished;
410 }
411 known++;
412 break;
413 case TranslatorMessage::Unfinished:
414 newType = TranslatorMessage::Unfinished;
415 known++;
416 break;
417 case TranslatorMessage::Vanished:
418 newType = TranslatorMessage::Finished;
419 neww++;
420 break;
421 case TranslatorMessage::Obsolete:
422 newType = TranslatorMessage::Unfinished;
423 neww++;
424 break;
425 }
426 }
427
428 // Always get the filename and linenumber info from the
429 // virgin Translator, in case it has changed location.
430 // This should also enable us to read a file that does not
431 // have the <location> element.
432 // why not use operator=()? Because it overwrites e.g. userData.
433 m.setReferences(mv->allReferences());
434 m.setPlural(mv->isPlural());
435 m.setExtras(extras);
436 m.setExtraComment(mv->extraComment());
437 m.setId(mv->id());
438 }
439 }
440
441 m.setType(newType);
442 outTor.append(msg: m);
443 }
444
445 /*
446 Messages found only in the virgin translator are added to the
447 vernacular translator.
448 */
449 foreach (const TranslatorMessage &mv, virginTor.messages()) {
450 if (mv.sourceText().isEmpty() && mv.id().isEmpty()) {
451 if (tor.find(context: mv.context()) >= 0)
452 continue;
453 } else {
454 if (tor.find(msg: mv) >= 0)
455 continue;
456 if (options & HeuristicSimilarText) {
457 int mi = tor.find(context: mv.context(), comment: mv.comment(), refs: mv.allReferences());
458 if (mi >= 0) {
459 // The similar message found in tor (ts file) must NOT correspond exactly
460 // to an other message is virginTor
461 if (virginTor.find(msg: tor.constMessage(i: mi)) < 0) {
462 if (getSimilarityScore(str1: tor.constMessage(i: mi).sourceText(), str2: mv.sourceText())
463 >= textSimilarityThreshold)
464 continue;
465 }
466 }
467 }
468 }
469 if (options & NoLocations)
470 outTor.append(msg: mv);
471 else
472 outTor.appendSorted(msg: mv);
473 if (!mv.sourceText().isEmpty() || !mv.id().isEmpty())
474 ++neww;
475 }
476
477 /*
478 "Alien" translators can be used to augment the vernacular translator.
479 */
480 foreach (const Translator &alf, aliens) {
481 foreach (TranslatorMessage mv, alf.messages()) {
482 if (mv.sourceText().isEmpty() || !mv.isTranslated())
483 continue;
484 int mvi = outTor.find(msg: mv);
485 if (mvi >= 0) {
486 TranslatorMessage &tm = outTor.message(i: mvi);
487 if (tm.type() != TranslatorMessage::Finished && !tm.isTranslated()) {
488 tm.setTranslations(mv.translations());
489 --neww;
490 ++known;
491 }
492 } else {
493 /*
494 * Don't do simtex search, as the locations are likely to be
495 * completely off anyway, so we'd find nothing.
496 */
497 /*
498 * Add the unmatched messages as obsoletes, so the Linguist GUI
499 * will offer them as possible translations.
500 */
501 mv.clearReferences();
502 mv.setType(mv.type() == TranslatorMessage::Finished
503 ? TranslatorMessage::Vanished : TranslatorMessage::Obsolete);
504 if (options & NoLocations)
505 outTor.append(msg: mv);
506 else
507 outTor.appendSorted(msg: mv);
508 ++known;
509 ++obsoleted;
510 }
511 }
512 }
513
514 /*
515 The same-text heuristic handles cases where a message has an
516 obsolete counterpart with a different context or comment.
517 */
518 int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(tor&: outTor) : 0;
519
520 /*
521 The number heuristic handles cases where a message has an
522 obsolete counterpart with mostly numbers differing in the
523 source text.
524 */
525 int sameNumberHeuristicCount = (options & HeuristicNumber) ? applyNumberHeuristic(tor&: outTor) : 0;
526
527 if (options & Verbose) {
528 int totalFound = neww + known;
529 err += LU::tr(sourceText: " Found %n source text(s) (%1 new and %2 already existing)\n", disambiguation: 0, n: totalFound).arg(a: neww).arg(a: known);
530
531 if (obsoleted) {
532 if (options & NoObsolete) {
533 err += LU::tr(sourceText: " Removed %n obsolete entries\n", disambiguation: 0, n: obsoleted);
534 } else {
535 err += LU::tr(sourceText: " Kept %n obsolete entries\n", disambiguation: 0, n: obsoleted);
536 }
537 }
538
539 if (sameNumberHeuristicCount)
540 err += LU::tr(sourceText: " Number heuristic provided %n translation(s)\n",
541 disambiguation: 0, n: sameNumberHeuristicCount);
542 if (sameTextHeuristicCount)
543 err += LU::tr(sourceText: " Same-text heuristic provided %n translation(s)\n",
544 disambiguation: 0, n: sameTextHeuristicCount);
545 if (similarTextHeuristicCount)
546 err += LU::tr(sourceText: " Similar-text heuristic provided %n translation(s)\n",
547 disambiguation: 0, n: similarTextHeuristicCount);
548 }
549 return outTor;
550}
551
552QT_END_NAMESPACE
553

source code of qttools/src/linguist/lupdate/merge.cpp