1 | /* |
2 | Open Asset Import Library (assimp) |
3 | ---------------------------------------------------------------------- |
4 | |
5 | Copyright (c) 2006-2017, assimp team |
6 | |
7 | All rights reserved. |
8 | |
9 | Redistribution and use of this software in source and binary forms, |
10 | with or without modification, are permitted provided that the |
11 | following conditions are met: |
12 | |
13 | * Redistributions of source code must retain the above |
14 | copyright notice, this list of conditions and the |
15 | following disclaimer. |
16 | |
17 | * Redistributions in binary form must reproduce the above |
18 | copyright notice, this list of conditions and the |
19 | following disclaimer in the documentation and/or other |
20 | materials provided with the distribution. |
21 | |
22 | * Neither the name of the assimp team, nor the names of its |
23 | contributors may be used to endorse or promote products |
24 | derived from this software without specific prior |
25 | written permission of the assimp team. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
28 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
29 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
30 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
31 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
32 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
33 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
34 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
35 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
36 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
37 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
38 | |
39 | ---------------------------------------------------------------------- |
40 | */ |
41 | |
42 | /** @file STEPFileEncoding.cpp |
43 | * @brief STEP character handling, string un-escaping |
44 | */ |
45 | #include "STEPFileEncoding.h" |
46 | #include "fast_atof.h" |
47 | #include <contrib/utf8cpp/source/utf8.h> |
48 | |
49 | #include <memory> |
50 | |
51 | using namespace Assimp; |
52 | |
// Lookup table used for the "\X\hh" (mac/roman) escape: maps the byte values
// 0x20..0xFF to 16-bit Unicode code points. The table is indexed with
// (byte - 0x20), so it holds 0x100 - 0x20 = 224 entries; byte values below
// 0x20 have no slot. An entry of 0x0000 marks an unassigned byte value.
static const uint16_t mac_codetable[] = {
    // 0x20 - 0x7E: identical to ASCII
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 0x20 - 0x27
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 0x28 - 0x2F
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 0x30 - 0x37
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 0x38 - 0x3F
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 0x40 - 0x47
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 0x48 - 0x4F
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 0x50 - 0x57
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 0x58 - 0x5F
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 0x60 - 0x67
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 0x68 - 0x6F
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 0x70 - 0x77
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E,         // 0x78 - 0x7E
    0x0000,                                                         // 0x7F: unassigned

    // 0x80 - 0xFF: upper half of the character set
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, // 0x80 - 0x87
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, // 0x88 - 0x8F
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, // 0x90 - 0x97
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, // 0x98 - 0x9F
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, // 0xA0 - 0xA7
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, // 0xA8 - 0xAF
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, // 0xB0 - 0xB7
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, // 0xB8 - 0xBF
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, // 0xC0 - 0xC7
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, // 0xC8 - 0xCF
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, // 0xD0 - 0xD7
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, // 0xD8 - 0xDF
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, // 0xE0 - 0xE7
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, // 0xE8 - 0xEF
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, // 0xF0 - 0xF7
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7  // 0xF8 - 0xFF
};
281 | |
282 | // ------------------------------------------------------------------------------------------------ |
283 | bool STEP::StringToUTF8(std::string& s) |
284 | { |
285 | // very basic handling for escaped string sequences |
286 | // http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no |
287 | |
288 | for (size_t i = 0; i < s.size(); ) { |
289 | if (s[i] == '\\') { |
290 | // \S\X - cp1252 (X is the character remapped to [0,127]) |
291 | if (i+3 < s.size() && s[i+1] == 'S' && s[i+2] == '\\') { |
292 | // http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly |
293 | ai_assert((uint8_t)s[i+3] < 0x80); |
294 | const uint8_t ch = s[i+3] + 0x80; |
295 | |
296 | s[i] = 0xc0 | (ch & 0xc0) >> 6; |
297 | s[i+1] = 0x80 | (ch & 0x3f); |
298 | |
299 | s.erase(i + 2,2); |
300 | ++i; |
301 | } |
302 | // \X\xx - mac/roman (xx is a hex sequence) |
303 | else if (i+4 < s.size() && s[i+1] == 'X' && s[i+2] == '\\') { |
304 | |
305 | const uint8_t macval = HexOctetToDecimal(s.c_str() + i + 3); |
306 | if(macval < 0x20) { |
307 | return false; |
308 | } |
309 | |
310 | ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20); |
311 | |
312 | const uint32_t unival = mac_codetable[macval - 0x20], *univalp = &unival; |
313 | |
314 | unsigned char temp[5], *tempp = temp; |
315 | ai_assert(sizeof( unsigned char ) == 1); |
316 | |
317 | utf8::utf32to8( univalp, univalp + 1, tempp ); |
318 | |
319 | const size_t outcount = static_cast<size_t>(tempp-temp); |
320 | |
321 | s.erase(i,5); |
322 | s.insert(i, reinterpret_cast<char*>(temp), outcount); |
323 | i += outcount; |
324 | } |
325 | // \Xn\ .. \X0\ - various unicode encodings (n=2: utf16; n=4: utf32) |
326 | else if (i+3 < s.size() && s[i+1] == 'X' && s[i+2] >= '0' && s[i+2] <= '9') { |
327 | switch(s[i+2]) { |
328 | // utf16 |
329 | case '2': |
330 | // utf32 |
331 | case '4': |
332 | if (s[i+3] == '\\') { |
333 | const size_t basei = i+4; |
334 | size_t j = basei, jend = s.size()-3; |
335 | |
336 | for (; j < jend; ++j) { |
337 | if (s[j] == '\\' && s[j+1] == 'X' && s[j+2] == '0' && s[j+3] == '\\') { |
338 | break; |
339 | } |
340 | } |
341 | if (j == jend) { |
342 | return false; |
343 | } |
344 | |
345 | if (j == basei) { |
346 | s.erase(i,8); |
347 | continue; |
348 | } |
349 | |
350 | if (s[i+2] == '2') { |
351 | if (((j - basei) % 4) != 0) { |
352 | return false; |
353 | } |
354 | |
355 | const size_t count = (j-basei)/4; |
356 | std::unique_ptr<uint16_t[]> src(new uint16_t[count]); |
357 | |
358 | const char* cur = s.c_str() + basei; |
359 | for (size_t k = 0; k < count; ++k, cur += 4) { |
360 | src[k] = (static_cast<uint16_t>(HexOctetToDecimal(cur)) << 8u) | |
361 | static_cast<uint16_t>(HexOctetToDecimal(cur+2)); |
362 | } |
363 | |
364 | const size_t dcount = count * 3; // this is enough to hold all possible outputs |
365 | std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]); |
366 | |
367 | const uint16_t* srct = src.get(); |
368 | unsigned char* destt = dest.get(); |
369 | utf8::utf16to8( srct, srct + count, destt ); |
370 | |
371 | const size_t outcount = static_cast<size_t>(destt-dest.get()); |
372 | |
373 | s.erase(i,(j+4-i)); |
374 | |
375 | ai_assert(sizeof(unsigned char) == 1); |
376 | s.insert(i, reinterpret_cast<char*>(dest.get()), outcount); |
377 | |
378 | i += outcount; |
379 | continue; |
380 | } |
381 | else if (s[i+2] == '4') { |
382 | if (((j - basei) % 8) != 0) { |
383 | return false; |
384 | } |
385 | |
386 | const size_t count = (j-basei)/8; |
387 | std::unique_ptr<uint32_t[]> src(new uint32_t[count]); |
388 | |
389 | const char* cur = s.c_str() + basei; |
390 | for (size_t k = 0; k < count; ++k, cur += 8) { |
391 | src[k] = (static_cast<uint32_t>(HexOctetToDecimal(cur )) << 24u) | |
392 | (static_cast<uint32_t>(HexOctetToDecimal(cur+2)) << 16u) | |
393 | (static_cast<uint32_t>(HexOctetToDecimal(cur+4)) << 8u) | |
394 | (static_cast<uint32_t>(HexOctetToDecimal(cur+6))); |
395 | } |
396 | |
397 | const size_t dcount = count * 5; // this is enough to hold all possible outputs |
398 | std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]); |
399 | |
400 | const uint32_t* srct = src.get(); |
401 | unsigned char* destt = dest.get(); |
402 | utf8::utf32to8( srct, srct + count, destt ); |
403 | |
404 | const size_t outcount = static_cast<size_t>(destt-dest.get()); |
405 | |
406 | s.erase(i,(j+4-i)); |
407 | |
408 | ai_assert(sizeof(unsigned char) == 1); |
409 | s.insert(i, reinterpret_cast<char*>(dest.get()), outcount); |
410 | |
411 | i += outcount; |
412 | continue; |
413 | } |
414 | } |
415 | break; |
416 | |
417 | // TODO: other encoding patterns? |
418 | |
419 | default: |
420 | return false; |
421 | } |
422 | } |
423 | } |
424 | ++i; |
425 | } |
426 | return true; |
427 | } |
428 | |