//===-- runtime/utf.cpp ---------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "utf.h" namespace Fortran::runtime { // clang-format off const std::uint8_t UTF8FirstByteTable[256]{ /* 00 - 7F: 7 bit payload in single byte */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80 - BF: invalid first byte, valid later byte */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C0 - DF: 11 bit payload */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* E0 - EF: 16 bit payload */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4, /* F8 - FB: 26 bit payload */ 5, 5, 5, 5, /* FC - FD: 31 bit payload */ 6, 6, /* FE: 32 bit payload */ 7, /* FF: invalid */ 0 }; // clang-format on // Non-minimal encodings are accepted. std::optional DecodeUTF8(const char *p0) { const std::uint8_t *p{reinterpret_cast(p0)}; std::size_t bytes{MeasureUTF8Bytes(*p0)}; if (bytes == 1) { return char32_t{*p}; } else if (bytes > 1) { std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; for (std::size_t j{1}; j < bytes; ++j) { std::uint8_t next{p[j]}; if (next < 0x80 || next > 0xbf) { return std::nullopt; } result = (result << 6) | (next & 0x3f); } if (result <= 0xffffffff) { return static_cast(result); } } return std::nullopt; } std::size_t EncodeUTF8(char *p0, char32_t ucs) { std::uint8_t *p{reinterpret_cast(p0)}; if (ucs <= 0x7f) { p[0] = ucs; return 1; } else if (ucs <= 0x7ff) { p[0] = 0xc0 | (ucs >> 6); p[1] = 0x80 | (ucs & 0x3f); return 2; } else if (ucs <= 0xffff) { p[0] = 0xe0 | (ucs >> 12); p[1] = 0x80 | ((ucs >> 6) & 0x3f); p[2] = 0x80 | (ucs & 0x3f); return 3; } else if (ucs <= 0x1fffff) { p[0] = 0xf0 | (ucs >> 18); p[1] = 0x80 | ((ucs >> 12) & 0x3f); p[2] = 0x80 | ((ucs >> 6) & 0x3f); p[3] = 0x80 | (ucs & 0x3f); return 4; } else if (ucs <= 0x3ffffff) { p[0] = 0xf8 | (ucs >> 24); p[1] = 0x80 | ((ucs >> 18) & 0x3f); p[2] = 0x80 | ((ucs >> 12) & 0x3f); p[3] = 0x80 | ((ucs >> 6) & 0x3f); p[4] = 0x80 | (ucs & 0x3f); return 5; } else if (ucs <= 0x7ffffff) { p[0] = 0xf8 | (ucs >> 30); p[1] = 0x80 | ((ucs >> 24) & 0x3f); p[2] = 0x80 | ((ucs >> 18) & 0x3f); p[3] = 0x80 | ((ucs >> 12) & 0x3f); p[4] = 0x80 | ((ucs >> 6) & 0x3f); p[5] = 0x80 | (ucs & 0x3f); return 6; } else { p[0] = 0xfe; p[1] = 0x80 | ((ucs >> 30) & 0x3f); p[2] = 0x80 | ((ucs >> 24) & 0x3f); p[3] = 0x80 | ((ucs >> 18) & 0x3f); p[4] = 0x80 | ((ucs >> 12) & 0x3f); p[5] = 0x80 | ((ucs >> 6) & 0x3f); p[6] = 0x80 | (ucs & 0x3f); return 7; } } } // namespace Fortran::runtime