1//===-- runtime/utf.cpp ---------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "utf.h"
10
11namespace Fortran::runtime {
12
13// clang-format off
14RT_OFFLOAD_VAR_GROUP_BEGIN
15const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
16 /* 00 - 7F: 7 bit payload in single byte */
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
25 /* 80 - BF: invalid first byte, valid later byte */
26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30 /* C0 - DF: 11 bit payload */
31 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
32 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
33 /* E0 - EF: 16 bit payload */
34 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
35 /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
36 /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
37 /* FC - FD: 31 bit payload */ 6, 6,
38 /* FE: 32 bit payload */ 7,
39 /* FF: invalid */ 0
40};
41RT_OFFLOAD_VAR_GROUP_END
42// clang-format on
43
44RT_OFFLOAD_API_GROUP_BEGIN
45// Non-minimal encodings are accepted.
46Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
47 const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
48 std::size_t bytes{MeasureUTF8Bytes(*p0)};
49 if (bytes == 1) {
50 return char32_t{*p};
51 } else if (bytes > 1) {
52 std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
53 for (std::size_t j{1}; j < bytes; ++j) {
54 std::uint8_t next{p[j]};
55 if (next < 0x80 || next > 0xbf) {
56 return Fortran::common::nullopt;
57 }
58 result = (result << 6) | (next & 0x3f);
59 }
60 if (result <= 0xffffffff) {
61 return static_cast<char32_t>(result);
62 }
63 }
64 return Fortran::common::nullopt;
65}
66
// Encodes the 32-bit value ucs into the buffer at p0 and returns the number
// of bytes written (1 through 7).
//
// The layout follows the original UTF-8 scheme of up to six bytes (31 bits),
// extended with a 7-byte form whose first byte is 0xFE for values that need
// all 32 bits.  The shortest form that fits is always chosen.  The caller
// must provide room for the bytes written (at most 7); no terminator is
// appended.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    // 7 bits: 0xxxxxxx
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    // 11 bits: 110xxxxx + 1 continuation byte
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    // 16 bits: 1110xxxx + 2 continuation bytes
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    // 21 bits: 11110xxx + 3 continuation bytes
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    // 26 bits: 111110xx + 4 continuation bytes
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) {
    // 31 bits: 1111110x + 5 continuation bytes.  The first byte must carry
    // the 0xFC marker (not the 5-byte 0xF8 marker) so that the length seen
    // by a decoder matches the six bytes emitted here.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    // 32 bits: 0xFE carries no payload; the top two bits of ucs land in the
    // first continuation byte.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
113RT_OFFLOAD_API_GROUP_END
114
115} // namespace Fortran::runtime
116

source code of flang/runtime/utf.cpp