1/*
2 * Copyright (C) 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2015 Igalia S.L.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
15 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
18 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
24 * THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "Hyphenation.h"
29
30#if USE(LIBHYPHEN)
31
32#include "FileSystem.h"
33#include <hyphen.h>
34#include <limits>
35#include <stdlib.h>
36#include <wtf/HashMap.h>
37#include <wtf/NeverDestroyed.h>
38#include <wtf/TinyLRUCache.h>
39#include <wtf/text/AtomicStringHash.h>
40#include <wtf/text/CString.h>
41#include <wtf/text/StringView.h>
42
43#if PLATFORM(GTK)
44#include "GtkUtilities.h"
45#include <wtf/glib/GUniquePtr.h>
46#endif
47
48namespace WebCore {
49
// Directories that are scanned for libhyphen dictionary files ("hyph_*.dic").
static const char* const gDictionaryDirectories[] = { "/usr/share/hyphen", "/usr/local/share/hyphen" };
54
55static String extractLocaleFromDictionaryFilePath(const String& filePath)
56{
57 // Dictionary files always have the form "hyph_<locale name>.dic"
58 // so we strip everything except the locale.
59 String fileName = pathGetFileName(filePath);
60 static const int prefixLength = 5;
61 static const int suffixLength = 4;
62 return fileName.substring(prefixLength, fileName.length() - prefixLength - suffixLength);
63}
64
65static void scanDirectoryForDicionaries(const char* directoryPath, HashMap<AtomicString, Vector<String>>& availableLocales)
66{
67 for (auto& filePath : listDirectory(directoryPath, "hyph_*.dic")) {
68 String locale = extractLocaleFromDictionaryFilePath(filePath).convertToASCIILowercase();
69
70 char normalizedPath[PATH_MAX];
71 if (!realpath(fileSystemRepresentation(filePath).data(), normalizedPath))
72 continue;
73
74 filePath = stringFromFileSystemRepresentation(normalizedPath);
75 availableLocales.add(locale, Vector<String>()).iterator->value.append(filePath);
76
77 String localeReplacingUnderscores = String(locale);
78 localeReplacingUnderscores.replace('_', '-');
79 if (locale != localeReplacingUnderscores)
80 availableLocales.add(localeReplacingUnderscores, Vector<String>()).iterator->value.append(filePath);
81
82 size_t dividerPosition = localeReplacingUnderscores.find('-');
83 if (dividerPosition != notFound) {
84 localeReplacingUnderscores.truncate(dividerPosition);
85 availableLocales.add(localeReplacingUnderscores, Vector<String>()).iterator->value.append(filePath);
86 }
87 }
88}
89
90#if ENABLE(DEVELOPER_MODE)
91static void scanTestDictionariesDirectoryIfNecessary(HashMap<AtomicString, Vector<String>>& availableLocales)
92{
93 // It's unfortunate that we need to look for the dictionaries this way, but
94 // libhyphen doesn't have the concept of installed dictionaries. Instead,
95 // we have this special case for WebKit tests.
96#if PLATFORM(GTK)
97 CString buildDirectory = webkitBuildDirectory();
98 GUniquePtr<char> dictionariesPath(g_build_filename(buildDirectory.data(), "DependenciesGTK", "Root", "webkitgtk-test-dicts", nullptr));
99 if (g_file_test(dictionariesPath.get(), static_cast<GFileTest>(G_FILE_TEST_IS_DIR))) {
100 scanDirectoryForDicionaries(dictionariesPath.get(), availableLocales);
101 return;
102 }
103
104 // Try alternative dictionaries path for people not using JHBuild.
105 dictionariesPath.reset(g_build_filename(buildDirectory.data(), "webkitgtk-test-dicts", nullptr));
106 scanDirectoryForDicionaries(dictionariesPath.get(), availableLocales);
107#elif defined(TEST_HYPHENATAION_PATH)
108 scanDirectoryForDicionaries(TEST_HYPHENATAION_PATH, availableLocales);
109#endif
110}
111#endif
112
113static HashMap<AtomicString, Vector<String>>& availableLocales()
114{
115 static bool scannedLocales = false;
116 static HashMap<AtomicString, Vector<String>> availableLocales;
117
118 if (!scannedLocales) {
119 for (size_t i = 0; i < WTF_ARRAY_LENGTH(gDictionaryDirectories); i++)
120 scanDirectoryForDicionaries(gDictionaryDirectories[i], availableLocales);
121
122#if ENABLE(DEVELOPER_MODE)
123 scanTestDictionariesDirectoryIfNecessary(availableLocales);
124#endif
125
126 scannedLocales = true;
127 }
128
129 return availableLocales;
130}
131
132bool canHyphenate(const AtomicString& localeIdentifier)
133{
134 if (localeIdentifier.isNull())
135 return false;
136 if (availableLocales().contains(localeIdentifier))
137 return true;
138 return availableLocales().contains(AtomicString(localeIdentifier.string().convertToASCIILowercase()));
139}
140
141class HyphenationDictionary : public RefCounted<HyphenationDictionary> {
142 WTF_MAKE_NONCOPYABLE(HyphenationDictionary);
143 WTF_MAKE_FAST_ALLOCATED;
144public:
145 typedef std::unique_ptr<HyphenDict, void(*)(HyphenDict*)> HyphenDictUniquePtr;
146
147 virtual ~HyphenationDictionary() { }
148 static RefPtr<HyphenationDictionary> createNull()
149 {
150 return adoptRef(new HyphenationDictionary());
151 }
152
153 static RefPtr<HyphenationDictionary> create(const CString& dictPath)
154 {
155 return adoptRef(new HyphenationDictionary(dictPath));
156 }
157
158 HyphenDict* libhyphenDictionary() const
159 {
160 return m_libhyphenDictionary.get();
161 }
162
163private:
164 HyphenationDictionary(const CString& dictPath)
165 : m_libhyphenDictionary(HyphenDictUniquePtr(hnj_hyphen_load(dictPath.data()), hnj_hyphen_free))
166 {
167 }
168
169 HyphenationDictionary()
170 : m_libhyphenDictionary(HyphenDictUniquePtr(nullptr, hnj_hyphen_free))
171 {
172 }
173
174 HyphenDictUniquePtr m_libhyphenDictionary;
175};
176
177template<>
178class TinyLRUCachePolicy<AtomicString, RefPtr<HyphenationDictionary>>
179{
180public:
181 static TinyLRUCache<AtomicString, RefPtr<WebCore::HyphenationDictionary>, 32>& cache()
182 {
183 static NeverDestroyed<TinyLRUCache<AtomicString, RefPtr<WebCore::HyphenationDictionary>, 32>> cache;
184 return cache;
185 }
186
187 static bool isKeyNull(const AtomicString& localeIdentifier)
188 {
189 return localeIdentifier.isNull();
190 }
191
192 static RefPtr<HyphenationDictionary> createValueForNullKey()
193 {
194 return HyphenationDictionary::createNull();
195 }
196
197 static RefPtr<HyphenationDictionary> createValueForKey(const AtomicString& dictionaryPath)
198 {
199 return HyphenationDictionary::create(fileSystemRepresentation(dictionaryPath.string()));
200 }
201};
202
203static void countLeadingSpaces(const CString& utf8String, int32_t& pointerOffset, int32_t& characterOffset)
204{
205 pointerOffset = 0;
206 characterOffset = 0;
207 const char* stringData = utf8String.data();
208 UChar32 character = 0;
209 while (static_cast<unsigned>(pointerOffset) < utf8String.length()) {
210 int32_t nextPointerOffset = pointerOffset;
211 U8_NEXT(stringData, nextPointerOffset, static_cast<int32_t>(utf8String.length()), character);
212
213 if (character < 0 || !u_isUWhiteSpace(character))
214 return;
215
216 pointerOffset = nextPointerOffset;
217 characterOffset++;
218 }
219}
220
221size_t lastHyphenLocation(StringView string, size_t beforeIndex, const AtomicString& localeIdentifier)
222{
223 // libhyphen accepts strings in UTF-8 format, but WebCore can only provide StringView
224 // which stores either UTF-16 or Latin1 data. This is unfortunate for performance
225 // reasons and we should consider switching to a more flexible hyphenation library
226 // if it is available.
227 CString utf8StringCopy = string.toStringWithoutCopying().utf8();
228
229 // WebCore often passes strings like " wordtohyphenate" to the platform layer. Since
230 // libhyphen isn't advanced enough to deal with leading spaces (presumably CoreFoundation
231 // can), we should find the appropriate indexes into the string to skip them.
232 int32_t leadingSpaceBytes;
233 int32_t leadingSpaceCharacters;
234 countLeadingSpaces(utf8StringCopy, leadingSpaceBytes, leadingSpaceCharacters);
235
236 // The libhyphen documentation specifies that this array should be 5 bytes longer than
237 // the byte length of the input string.
238 Vector<char> hyphenArray(utf8StringCopy.length() - leadingSpaceBytes + 5);
239 char* hyphenArrayData = hyphenArray.data();
240
241 String lowercaseLocaleIdentifier = AtomicString(localeIdentifier.string().convertToASCIILowercase());
242 ASSERT(availableLocales().contains(lowercaseLocaleIdentifier));
243 for (const auto& dictionaryPath : availableLocales().get(lowercaseLocaleIdentifier)) {
244 RefPtr<HyphenationDictionary> dictionary = TinyLRUCachePolicy<AtomicString, RefPtr<HyphenationDictionary>>::cache().get(AtomicString(dictionaryPath));
245
246 char** replacements = nullptr;
247 int* positions = nullptr;
248 int* removedCharacterCounts = nullptr;
249 hnj_hyphen_hyphenate2(dictionary->libhyphenDictionary(),
250 utf8StringCopy.data() + leadingSpaceBytes,
251 utf8StringCopy.length() - leadingSpaceBytes,
252 hyphenArrayData,
253 nullptr, /* output parameter for hyphenated word */
254 &replacements,
255 &positions,
256 &removedCharacterCounts);
257
258 if (replacements) {
259 for (unsigned i = 0; i < utf8StringCopy.length() - leadingSpaceBytes - 1; i++)
260 free(replacements[i]);
261 free(replacements);
262 }
263
264 free(positions);
265 free(removedCharacterCounts);
266
267 for (int i = beforeIndex - leadingSpaceCharacters - 2; i >= 0; i--) {
268 // libhyphen will put an odd number in hyphenArrayData at all
269 // hyphenation points. A number & 1 will be true for odd numbers.
270 if (hyphenArrayData[i] & 1)
271 return i + 1 + leadingSpaceCharacters;
272 }
273 }
274
275 return 0;
276}
277
278} // namespace WebCore
279
280#endif // USE(LIBHYPHEN)
281