/* * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ /** * @mainpage * * utf8proc is a free/open-source (MIT/expat licensed) C library * providing Unicode normalization, case-folding, and other operations * for strings in the UTF-8 encoding, supporting Unicode version * 7.0.0. See the utf8proc home page (http://julialang.org/utf8proc/) * for downloads and other information, or the source code on github * (https://github.com/JuliaLang/utf8proc). * * For the utf8proc API documentation, see: @ref utf8proc.h * * The features of utf8proc include: * * - Transformation of strings (@ref utf8proc_map) to: * - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character) * - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT) * - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK) * - case-folding (@ref UTF8PROC_CASEFOLD) * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND) * - Character-width computation: @ref utf8proc_charwidth * - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string * - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8. */ /** @file */ #ifndef UTF8PROC_H #define UTF8PROC_H /** @name API version * * The utf8proc API version MAJOR.MINOR.PATCH, following * semantic-versioning rules (http://semver.org) based on API * compatibility. * * This is also returned at runtime by @ref utf8proc_version; however, the * runtime version may append a string like "-dev" to the version number * for prerelease versions. * * @note The shared-library version number in the Makefile may be different, * being based on ABI compatibility rather than API compatibility. */ /** @{ */ /** The MAJOR version number (increased when backwards API compatibility is broken). */ #define UTF8PROC_VERSION_MAJOR 1 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ #define UTF8PROC_VERSION_MINOR 3 /** The PATCH version (increased for fixes that do not change the API). */ #define UTF8PROC_VERSION_PATCH 0 /** @} */ #include #include #ifdef _MSC_VER typedef signed char utf8proc_int8_t; typedef unsigned char utf8proc_uint8_t; typedef short utf8proc_int16_t; typedef unsigned short utf8proc_uint16_t; typedef int utf8proc_int32_t; typedef unsigned int utf8proc_uint32_t; # ifdef _WIN64 typedef __int64 utf8proc_ssize_t; typedef unsigned __int64 utf8proc_size_t; # else typedef int utf8proc_ssize_t; typedef unsigned int utf8proc_size_t; # endif # ifndef __cplusplus typedef unsigned char utf8proc_bool; enum {false, true}; # else typedef bool utf8proc_bool; # endif #else #ifdef __cplusplus # include # include #else # include # include #endif typedef int8_t utf8proc_int8_t; typedef uint8_t utf8proc_uint8_t; typedef int16_t utf8proc_int16_t; typedef uint16_t utf8proc_uint16_t; typedef int32_t utf8proc_int32_t; typedef uint32_t utf8proc_uint32_t; typedef size_t utf8proc_size_t; typedef ssize_t utf8proc_ssize_t; typedef bool utf8proc_bool; #endif #ifdef __cplusplus # include #else # include #endif /** @name Error codes * Error codes being returned by almost all functions. */ /** @{ */ /** Memory could not be allocated. */ #define UTF8PROC_ERROR_NOMEM -1 /** The given string is too long to be processed. */ #define UTF8PROC_ERROR_OVERFLOW -2 /** The given string is not a legal UTF-8 string. */ #define UTF8PROC_ERROR_INVALIDUTF8 -3 /** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */ #define UTF8PROC_ERROR_NOTASSIGNED -4 /** Invalid options have been used. */ #define UTF8PROC_ERROR_INVALIDOPTS -5 /** @} */ #define UTF8PROC_cont(ch) (((ch) & 0xc0) == 0x80) /** * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`. * The maximum number of bytes read is `strlen`, unless `strlen` is * negative (in which case up to 4 bytes are read). * * If a valid codepoint could be read, it is stored in the variable * pointed to by `codepoint_ref`, otherwise that variable will be set to -1. * In case of success, the number of bytes read is returned; otherwise, a * negative error code is returned. */ static inline utf8proc_ssize_t utf8proc_iterate( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst ) { utf8proc_uint32_t uc; const utf8proc_uint8_t *end; *dst = -1; if (!strlen) return 0; end = str + ((strlen < 0) ? 4 : strlen); uc = *str++; if (uc < 0x80) { *dst = uc; return 1; } // Must be between 0xc2 and 0xf4 inclusive to be valid if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; if (uc < 0xe0) { // 2-byte sequence // Must have valid continuation character if (!UTF8PROC_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); return 2; } if (uc < 0xf0) { // 3-byte sequence if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1])) return UTF8PROC_ERROR_INVALIDUTF8; // Check for surrogate chars if (uc == 0xed && *str > 0x9f) return UTF8PROC_ERROR_INVALIDUTF8; uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); if (uc < 0x800) return UTF8PROC_ERROR_INVALIDUTF8; *dst = uc; return 3; } // 4-byte sequence // Must have 3 valid continuation characters if ((str + 2 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]) || !UTF8PROC_cont(str[2])) return UTF8PROC_ERROR_INVALIDUTF8; // Make sure in correct range (0x10000 - 0x10ffff) if (uc == 0xf0) { if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; } else if (uc == 0xf4) { if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; } *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); return 4; } /** * Encodes the codepoint as an UTF-8 string in the byte array pointed * to by `dst`. This array must be at least 4 bytes long. * * In case of success the number of bytes written is returned, and * otherwise 0 is returned. * * This function does not check whether `codepoint` is valid Unicode. */ static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { if (uc < 0x00) { return 0; } else if (uc < 0x80) { dst[0] = uc; return 1; } else if (uc < 0x800) { dst[0] = 0xC0 + (uc >> 6); dst[1] = 0x80 + (uc & 0x3F); return 2; // Note: we allow encoding 0xd800-0xdfff here, so as not to change // the API, however, these are actually invalid in UTF-8 } else if (uc < 0x10000) { dst[0] = 0xE0 + (uc >> 12); dst[1] = 0x80 + ((uc >> 6) & 0x3F); dst[2] = 0x80 + (uc & 0x3F); return 3; } else if (uc < 0x110000) { dst[0] = 0xF0 + (uc >> 18); dst[1] = 0x80 + ((uc >> 12) & 0x3F); dst[2] = 0x80 + ((uc >> 6) & 0x3F); dst[3] = 0x80 + (uc & 0x3F); return 4; } else return 0; } #ifdef __cplusplus #include #include class UTF8Iterator { std::string::const_iterator m_it; public: using iterator_category = std::forward_iterator_tag; using value_type = uint32_t; using difference_type = std::ptrdiff_t; using pointer = uint32_t*; using reference = uint32_t&; UTF8Iterator(const std::string::const_iterator& it) : m_it(it) {} UTF8Iterator& operator+=(size_t v) { for (size_t i=0 ; i(&*m_it), -1, &dummy); #ifndef NDEBUG if (*m_it == '\0') { fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n"); abort(); } else if (sz > 0) m_it += sz; else { fprintf(stderr, "ERROR! UTF8Iterator character fail"); abort(); } #else if (sz > 0) m_it += sz; #endif } return *this; } UTF8Iterator& operator++() { return this->operator+=(1); } UTF8Iterator operator+(size_t v) const { UTF8Iterator ret(m_it); ret += v; return ret; } uint32_t operator*() const { utf8proc_int32_t ret; utf8proc_iterate(reinterpret_cast(&*m_it), -1, &ret); return ret; } std::string::const_iterator iter() const {return m_it;} size_t countTo(std::string::const_iterator end) const { UTF8Iterator it(m_it); size_t ret = 0; while (it.iter() < end) { ++ret; ++it; } return ret; } }; #endif #endif