1/*
2Open Asset Import Library (assimp)
3----------------------------------------------------------------------
4
5Copyright (c) 2006-2017, assimp team
6
7All rights reserved.
8
9Redistribution and use of this software in source and binary forms,
10with or without modification, are permitted provided that the
11following conditions are met:
12
13* Redistributions of source code must retain the above
14 copyright notice, this list of conditions and the
15 following disclaimer.
16
17* Redistributions in binary form must reproduce the above
18 copyright notice, this list of conditions and the
19 following disclaimer in the documentation and/or other
20 materials provided with the distribution.
21
22* Neither the name of the assimp team, nor the names of its
23 contributors may be used to endorse or promote products
24 derived from this software without specific prior
25 written permission of the assimp team.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38
39----------------------------------------------------------------------
40*/
41
42/** @file STEPFileEncoding.cpp
43 * @brief STEP character handling, string un-escaping
44 */
45#include "STEPFileEncoding.h"
46#include "fast_atof.h"
47#include <contrib/utf8cpp/source/utf8.h>
48
49#include <memory>
50
51using namespace Assimp;
52
// Lookup table used for the \X\hh escape: maps a mac/roman byte value to a
// 16 bit Unicode code unit. The 0x20 unassigned/non-printable slots below
// 0x20 are not stored, so the table is indexed with (byte value - 0x20) and
// holds 0x100 - 0x20 entries. Each row lists eight consecutive byte values.
static const uint16_t mac_codetable[] = {
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 0x20 - 0x27
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 0x28 - 0x2F
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 0x30 - 0x37
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 0x38 - 0x3F
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 0x40 - 0x47
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 0x48 - 0x4F
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 0x50 - 0x57
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 0x58 - 0x5F
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 0x60 - 0x67
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 0x68 - 0x6F
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 0x70 - 0x77
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E,
    0x0000, // 0x7F is unassigned
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, // 0x80 - 0x87
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, // 0x88 - 0x8F
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, // 0x90 - 0x97
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, // 0x98 - 0x9F
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, // 0xA0 - 0xA7
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, // 0xA8 - 0xAF
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, // 0xB0 - 0xB7
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, // 0xB8 - 0xBF
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, // 0xC0 - 0xC7
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, // 0xC8 - 0xCF
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, // 0xD0 - 0xD7
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, // 0xD8 - 0xDF
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, // 0xE0 - 0xE7
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, // 0xE8 - 0xEF
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, // 0xF0 - 0xF7
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7  // 0xF8 - 0xFF
};
281
282// ------------------------------------------------------------------------------------------------
283bool STEP::StringToUTF8(std::string& s)
284{
285 // very basic handling for escaped string sequences
286 // http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no
287
288 for (size_t i = 0; i < s.size(); ) {
289 if (s[i] == '\\') {
290 // \S\X - cp1252 (X is the character remapped to [0,127])
291 if (i+3 < s.size() && s[i+1] == 'S' && s[i+2] == '\\') {
292 // http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly
293 ai_assert((uint8_t)s[i+3] < 0x80);
294 const uint8_t ch = s[i+3] + 0x80;
295
296 s[i] = 0xc0 | (ch & 0xc0) >> 6;
297 s[i+1] = 0x80 | (ch & 0x3f);
298
299 s.erase(i + 2,2);
300 ++i;
301 }
302 // \X\xx - mac/roman (xx is a hex sequence)
303 else if (i+4 < s.size() && s[i+1] == 'X' && s[i+2] == '\\') {
304
305 const uint8_t macval = HexOctetToDecimal(s.c_str() + i + 3);
306 if(macval < 0x20) {
307 return false;
308 }
309
310 ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20);
311
312 const uint32_t unival = mac_codetable[macval - 0x20], *univalp = &unival;
313
314 unsigned char temp[5], *tempp = temp;
315 ai_assert(sizeof( unsigned char ) == 1);
316
317 utf8::utf32to8( univalp, univalp + 1, tempp );
318
319 const size_t outcount = static_cast<size_t>(tempp-temp);
320
321 s.erase(i,5);
322 s.insert(i, reinterpret_cast<char*>(temp), outcount);
323 i += outcount;
324 }
325 // \Xn\ .. \X0\ - various unicode encodings (n=2: utf16; n=4: utf32)
326 else if (i+3 < s.size() && s[i+1] == 'X' && s[i+2] >= '0' && s[i+2] <= '9') {
327 switch(s[i+2]) {
328 // utf16
329 case '2':
330 // utf32
331 case '4':
332 if (s[i+3] == '\\') {
333 const size_t basei = i+4;
334 size_t j = basei, jend = s.size()-3;
335
336 for (; j < jend; ++j) {
337 if (s[j] == '\\' && s[j+1] == 'X' && s[j+2] == '0' && s[j+3] == '\\') {
338 break;
339 }
340 }
341 if (j == jend) {
342 return false;
343 }
344
345 if (j == basei) {
346 s.erase(i,8);
347 continue;
348 }
349
350 if (s[i+2] == '2') {
351 if (((j - basei) % 4) != 0) {
352 return false;
353 }
354
355 const size_t count = (j-basei)/4;
356 std::unique_ptr<uint16_t[]> src(new uint16_t[count]);
357
358 const char* cur = s.c_str() + basei;
359 for (size_t k = 0; k < count; ++k, cur += 4) {
360 src[k] = (static_cast<uint16_t>(HexOctetToDecimal(cur)) << 8u) |
361 static_cast<uint16_t>(HexOctetToDecimal(cur+2));
362 }
363
364 const size_t dcount = count * 3; // this is enough to hold all possible outputs
365 std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]);
366
367 const uint16_t* srct = src.get();
368 unsigned char* destt = dest.get();
369 utf8::utf16to8( srct, srct + count, destt );
370
371 const size_t outcount = static_cast<size_t>(destt-dest.get());
372
373 s.erase(i,(j+4-i));
374
375 ai_assert(sizeof(unsigned char) == 1);
376 s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
377
378 i += outcount;
379 continue;
380 }
381 else if (s[i+2] == '4') {
382 if (((j - basei) % 8) != 0) {
383 return false;
384 }
385
386 const size_t count = (j-basei)/8;
387 std::unique_ptr<uint32_t[]> src(new uint32_t[count]);
388
389 const char* cur = s.c_str() + basei;
390 for (size_t k = 0; k < count; ++k, cur += 8) {
391 src[k] = (static_cast<uint32_t>(HexOctetToDecimal(cur )) << 24u) |
392 (static_cast<uint32_t>(HexOctetToDecimal(cur+2)) << 16u) |
393 (static_cast<uint32_t>(HexOctetToDecimal(cur+4)) << 8u) |
394 (static_cast<uint32_t>(HexOctetToDecimal(cur+6)));
395 }
396
397 const size_t dcount = count * 5; // this is enough to hold all possible outputs
398 std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]);
399
400 const uint32_t* srct = src.get();
401 unsigned char* destt = dest.get();
402 utf8::utf32to8( srct, srct + count, destt );
403
404 const size_t outcount = static_cast<size_t>(destt-dest.get());
405
406 s.erase(i,(j+4-i));
407
408 ai_assert(sizeof(unsigned char) == 1);
409 s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
410
411 i += outcount;
412 continue;
413 }
414 }
415 break;
416
417 // TODO: other encoding patterns?
418
419 default:
420 return false;
421 }
422 }
423 }
424 ++i;
425 }
426 return true;
427}
428