/* * * Embedded Linux library * * Copyright (C) 2011-2014 Intel Corporation. All rights reserved. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include "strv.h" #include "utf8.h" #include "private.h" #include "useful.h" /** * SECTION:utf8 * @short_description: UTF-8 utility function * * UTF-8 string handling support */ LIB_EXPORT unsigned char l_ascii_table[256] = { [0x00 ... 0x08] = L_ASCII_CNTRL, [0x09 ... 0x0D] = L_ASCII_CNTRL | L_ASCII_SPACE, [0x0E ... 0x1F] = L_ASCII_CNTRL, [0x20] = L_ASCII_PRINT | L_ASCII_SPACE, [0x21 ... 0x2F] = L_ASCII_PRINT | L_ASCII_PUNCT, [0x30 ... 0x39] = L_ASCII_DIGIT | L_ASCII_XDIGIT | L_ASCII_PRINT, [0x3A ... 0x40] = L_ASCII_PRINT | L_ASCII_PUNCT, [0x41 ... 0x46] = L_ASCII_PRINT | L_ASCII_XDIGIT | L_ASCII_UPPER, [0x47 ... 0x5A] = L_ASCII_PRINT | L_ASCII_UPPER, [0x5B ... 0x60] = L_ASCII_PRINT | L_ASCII_PUNCT, [0x61 ... 0x66] = L_ASCII_PRINT | L_ASCII_XDIGIT | L_ASCII_LOWER, [0x67 ... 0x7A] = L_ASCII_PRINT | L_ASCII_LOWER, [0x7B ... 0x7E] = L_ASCII_PRINT | L_ASCII_PUNCT, [0x7F] = L_ASCII_CNTRL, [0x80 ... 0xFF] = 0, }; static inline bool __attribute__ ((always_inline)) valid_unicode(wchar_t c) { if (c <= 0xd7ff) return true; if (c < 0xe000 || c > 0x10ffff) return false; if (c >= 0xfdd0 && c <= 0xfdef) return false; if ((c & 0xfffe) == 0xfffe) return false; return true; } /** * l_utf8_get_codepoint * @str: a pointer to codepoint data * @len: maximum bytes to read * @cp: destination for codepoint * * Returns: number of bytes read, or -1 for invalid coddepoint **/ LIB_EXPORT int l_utf8_get_codepoint(const char *str, size_t len, wchar_t *cp) { static const wchar_t mins[3] = { 1 << 7, 1 << 11, 1 << 16 }; unsigned int expect_bytes; wchar_t val; size_t i; if (len == 0) return 0; if ((signed char) str[0] > 0) { *cp = str[0]; return 1; } expect_bytes = __builtin_clz(~((unsigned char)str[0] << 24)); if (expect_bytes < 2 || expect_bytes > 4) goto error; if (expect_bytes > len) goto error; val = str[0] & (0xff >> (expect_bytes + 1)); for (i = 1; i < expect_bytes; i++) { if ((str[i] & 0xc0) != 0x80) goto error; val <<= 6; val |= str[i] & 0x3f; } if (val < mins[expect_bytes - 2]) goto error; if (valid_unicode(val) == false) goto error; *cp = val; return expect_bytes; error: return -1; } /** * l_utf8_validate: * @str: a pointer to character data * @len: max bytes to validate * @end: return location for end of valid data * * Validates UTF-8 encoded text. If @end is non-NULL, then the end of * the valid range will be stored there (i.e. the start of the first * invalid character if some bytes were invalid, or the end of the text * being validated otherwise). * * Returns: Whether the text was valid UTF-8 **/ LIB_EXPORT bool l_utf8_validate(const char *str, size_t len, const char **end) { size_t pos = 0; int ret; wchar_t val; while (pos < len && str[pos]) { ret = l_utf8_get_codepoint(str + pos, len - pos, &val); if (ret < 0) goto error; pos += ret; } error: if (end) *end = str + pos; if (pos != len) return false; return true; } /** * l_utf8_strlen: * @str: a pointer to character data * * Computes the number of UTF-8 characters (not bytes) in the string given * by @str. * * Returns: The number of UTF-8 characters in the string **/ LIB_EXPORT size_t l_utf8_strlen(const char *str) { size_t l = 0; size_t i; unsigned char b; for (i = 0; str[i]; i++) { b = str[i]; if ((b >> 6) == 2) l += 1; } return i - l; } static inline int __attribute__ ((always_inline)) utf8_length(wchar_t c) { if (c <= 0x7f) return 1; if (c <= 0x7ff) return 2; if (c <= 0xffff) return 3; return 4; } static inline uint16_t __attribute__ ((always_inline)) surrogate_value(uint16_t h, uint16_t l) { return 0x10000 + (h - 0xd800) * 0x400 + l - 0xdc00; } /* * l_utf8_from_wchar: * @c: a wide-character to convert * @out_buf: Buffer to write out to * * Assumes c is valid unicode and out_buf contains enough space for a single * utf8 character (maximum 4 bytes) * Returns: number of characters written */ LIB_EXPORT size_t l_utf8_from_wchar(wchar_t c, char *out_buf) { int len = utf8_length(c); int i; if (len == 1) { out_buf[0] = c; return 1; } for (i = len - 1; i; i--) { out_buf[i] = (c & 0x3f) | 0x80; c >>= 6; } out_buf[0] = (0xff << (8 - len)) | c; return len; } /** * l_utf8_from_utf16: * @utf16: Array of UTF16 characters * @utf16_size: The size of the @utf16 array in bytes. Must be a multiple of 2. * * Returns: A newly-allocated buffer containing UTF16 encoded string converted * to UTF8. The UTF8 string will always be null terminated, even if the * original UTF16 string was not. **/ LIB_EXPORT char *l_utf8_from_utf16(const void *utf16, ssize_t utf16_size) { char *utf8; size_t utf8_len = 0; wchar_t high_surrogate = 0; ssize_t i = 0; uint16_t in; wchar_t c; if (unlikely(utf16_size % 2)) return NULL; while (utf16_size < 0 || i < utf16_size) { in = l_get_u16(utf16 + i); if (!in) break; if (in >= 0xdc00 && in < 0xe000) { if (high_surrogate) c = surrogate_value(high_surrogate, in); else return NULL; high_surrogate = 0; } else { if (high_surrogate) return NULL; if (in >= 0xd800 && in < 0xdc00) { high_surrogate = in; goto next; } c = in; } if (!valid_unicode(c)) return NULL; utf8_len += utf8_length(c); next: i += 2; } if (high_surrogate) return NULL; utf8 = l_malloc(utf8_len + 1); utf8_len = 0; i = 0; while (utf16_size < 0 || i < utf16_size) { in = l_get_u16(utf16 + i); if (!in) break; if (in >= 0xd800 && in < 0xdc00) { high_surrogate = in; i += 2; in = l_get_u16(utf16 + i); c = surrogate_value(high_surrogate, in); } else c = in; utf8_len += l_utf8_from_wchar(c, utf8 + utf8_len); i += 2; } utf8[utf8_len] = '\0'; return utf8; } /** * l_utf8_to_utf16: * @utf8: UTF8 formatted string * @out_size: The size in bytes of the converted utf16 string * * Converts a UTF8 formatted string to UTF16. It is assumed that the string * is valid UTF8 and no sanity checking is performed. * * Returns: A newly-allocated buffer containing UTF8 encoded string converted * to UTF16. The UTF16 string will always be null terminated. **/ LIB_EXPORT void *l_utf8_to_utf16(const char *utf8, size_t *out_size) { const char *c; wchar_t wc; int len; uint16_t *utf16; size_t n_utf16; if (unlikely(!utf8)) return NULL; c = utf8; n_utf16 = 0; while (*c) { len = l_utf8_get_codepoint(c, 4, &wc); if (len < 0) return NULL; if (wc < 0x10000) n_utf16 += 1; else n_utf16 += 2; c += len; } utf16 = l_malloc((n_utf16 + 1) * 2); c = utf8; n_utf16 = 0; while (*c) { len = l_utf8_get_codepoint(c, 4, &wc); if (wc >= 0x10000) { utf16[n_utf16++] = (wc - 0x1000) / 0x400 + 0xd800; utf16[n_utf16++] = (wc - 0x1000) % 0x400 + 0xdc00; } else utf16[n_utf16++] = wc; c += len; } utf16[n_utf16] = 0; if (out_size) *out_size = (n_utf16 + 1) * 2; return utf16; } /** * l_utf8_from_ucs2be: * @ucs2be: Array of UCS2 characters in big-endian format * @ucs2be_size: The size of the @ucs2 array in bytes. Must be a multiple of 2. * * Returns: A newly-allocated buffer containing UCS2BE encoded string converted * to UTF8. The UTF8 string will always be null terminated, even if the * original UCS2BE string was not. **/ LIB_EXPORT char *l_utf8_from_ucs2be(const void *ucs2be, ssize_t ucs2be_size) { char *utf8; size_t utf8_len = 0; ssize_t i = 0; uint16_t in; if (unlikely(ucs2be_size % 2)) return NULL; while (ucs2be_size < 0 || i < ucs2be_size) { in = l_get_be16(ucs2be + i); if (!in) break; if (in >= 0xd800 && in < 0xe000) return NULL; if (!valid_unicode(in)) return NULL; utf8_len += utf8_length(in); i += 2; } utf8 = l_malloc(utf8_len + 1); utf8_len = 0; i = 0; while (ucs2be_size < 0 || i < ucs2be_size) { in = l_get_be16(ucs2be + i); if (!in) break; utf8_len += l_utf8_from_wchar(in, utf8 + utf8_len); i += 2; } utf8[utf8_len] = '\0'; return utf8; } /** * l_utf8_to_ucs2be: * @utf8: UTF8 formatted string * @out_size: The size in bytes of the converted ucs2be string * * Converts a UTF8 formatted string to UCS2BE. It is assumed that the string * is valid UTF8 and no sanity checking is performed. * * Returns: A newly-allocated buffer containing UTF8 encoded string converted * to UCS2BE. The UCS2BE string will always be null terminated. **/ LIB_EXPORT void *l_utf8_to_ucs2be(const char *utf8, size_t *out_size) { const char *c; wchar_t wc; int len; uint16_t *ucs2be; size_t n_ucs2be; if (unlikely(!utf8)) return NULL; c = utf8; n_ucs2be = 0; while (*c) { len = l_utf8_get_codepoint(c, 4, &wc); if (len < 0) return NULL; if (wc >= 0x10000) return NULL; n_ucs2be += 1; c += len; } ucs2be = l_malloc((n_ucs2be + 1) * 2); c = utf8; n_ucs2be = 0; while (*c) { len = l_utf8_get_codepoint(c, 4, &wc); ucs2be[n_ucs2be++] = L_CPU_TO_BE16(wc); c += len; } ucs2be[n_ucs2be] = 0; if (out_size) *out_size = (n_ucs2be + 1) * 2; return ucs2be; }