diff --git a/include/athena/utf8proc.h b/include/athena/utf8proc.h index 9ca5980..4debe2f 100644 --- a/include/athena/utf8proc.h +++ b/include/athena/utf8proc.h @@ -20,8 +20,7 @@ * DEALINGS IN THE SOFTWARE. */ - -/** +/** * @mainpage * * utf8proc is a free/open-source (MIT/expat licensed) C library @@ -36,9 +35,11 @@ * The features of utf8proc include: * * - Transformation of strings (@ref utf8proc_map) to: - * - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character) + * - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters + * (http://en.wikipedia.org/wiki/Combining_character) * - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT) - * - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK) + * - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining + * characters such as accents (@ref UTF8PROC_STRIPMARK) * - case-folding (@ref UTF8PROC_CASEFOLD) * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND) @@ -53,7 +54,7 @@ #define UTF8PROC_H /** @name API version - * + * * The utf8proc API version MAJOR.MINOR.PATCH, following * semantic-versioning rules (http://semver.org) based on API * compatibility. @@ -83,26 +84,25 @@ typedef short utf8proc_int16_t; typedef unsigned short utf8proc_uint16_t; typedef int utf8proc_int32_t; typedef unsigned int utf8proc_uint32_t; -# ifdef _WIN64 +#ifdef _WIN64 typedef __int64 utf8proc_ssize_t; typedef unsigned __int64 utf8proc_size_t; -# else +#else typedef int utf8proc_ssize_t; typedef unsigned int utf8proc_size_t; -# endif -# ifndef __cplusplus +#endif +#ifndef __cplusplus typedef unsigned char utf8proc_bool; -enum {false, true}; -# else +enum { false, true }; +#else typedef bool utf8proc_bool; -# endif +#endif #else #ifdef __cplusplus -# include -# include +#include #else -# include -# include +#include +#include #endif typedef int8_t utf8proc_int8_t; typedef uint8_t utf8proc_uint8_t; @@ -115,9 +115,9 @@ typedef ssize_t utf8proc_ssize_t; typedef bool utf8proc_bool; #endif #ifdef __cplusplus -# include +#include #else -# include +#include #endif /** @name Error codes @@ -136,7 +136,7 @@ typedef bool utf8proc_bool; #define UTF8PROC_ERROR_INVALIDOPTS -5 /** @} */ -#define UTF8PROC_cont(ch) (((ch) & 0xc0) == 0x80) +#define UTF8PROC_cont(ch) (((ch) & 0xc0) == 0x80) /** * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`. @@ -148,14 +148,14 @@ typedef bool utf8proc_bool; * In case of success, the number of bytes read is returned; otherwise, a * negative error code is returned. */ -static inline utf8proc_ssize_t utf8proc_iterate( - const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst -) { +static inline utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t* str, utf8proc_ssize_t strlen, + utf8proc_int32_t* dst) { utf8proc_uint32_t uc; - const utf8proc_uint8_t *end; + const utf8proc_uint8_t* end; *dst = -1; - if (!strlen) return 0; + if (!strlen) + return 0; end = str + ((strlen < 0) ? 4 : strlen); uc = *str++; if (uc < 0x80) { @@ -163,36 +163,40 @@ static inline utf8proc_ssize_t utf8proc_iterate( return 1; } // Must be between 0xc2 and 0xf4 inclusive to be valid - if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; - if (uc < 0xe0) { // 2-byte sequence - // Must have valid continuation character - if (!UTF8PROC_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; - *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); - return 2; + if ((uc - 0xc2) > (0xf4 - 0xc2)) + return UTF8PROC_ERROR_INVALIDUTF8; + if (uc < 0xe0) { // 2-byte sequence + // Must have valid continuation character + if (!UTF8PROC_cont(*str)) + return UTF8PROC_ERROR_INVALIDUTF8; + *dst = ((uc & 0x1f) << 6) | (*str & 0x3f); + return 2; } - if (uc < 0xf0) { // 3-byte sequence - if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1])) - return UTF8PROC_ERROR_INVALIDUTF8; - // Check for surrogate chars - if (uc == 0xed && *str > 0x9f) - return UTF8PROC_ERROR_INVALIDUTF8; - uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); - if (uc < 0x800) - return UTF8PROC_ERROR_INVALIDUTF8; - *dst = uc; - return 3; + if (uc < 0xf0) { // 3-byte sequence + if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1])) + return UTF8PROC_ERROR_INVALIDUTF8; + // Check for surrogate chars + if (uc == 0xed && *str > 0x9f) + return UTF8PROC_ERROR_INVALIDUTF8; + uc = ((uc & 0xf) << 12) | ((*str & 0x3f) << 6) | (str[1] & 0x3f); + if (uc < 0x800) + return UTF8PROC_ERROR_INVALIDUTF8; + *dst = uc; + return 3; } // 4-byte sequence // Must have 3 valid continuation characters if ((str + 2 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]) || !UTF8PROC_cont(str[2])) - return UTF8PROC_ERROR_INVALIDUTF8; + return UTF8PROC_ERROR_INVALIDUTF8; // Make sure in correct range (0x10000 - 0x10ffff) if (uc == 0xf0) { - if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; + if (*str < 0x90) + return UTF8PROC_ERROR_INVALIDUTF8; } else if (uc == 0xf4) { - if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; + if (*str > 0x8f) + return UTF8PROC_ERROR_INVALIDUTF8; } - *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); + *dst = ((uc & 7) << 18) | ((*str & 0x3f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f); return 4; } @@ -205,7 +209,7 @@ static inline utf8proc_ssize_t utf8proc_iterate( * * This function does not check whether `codepoint` is valid Unicode. */ -static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { +static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t* dst) { if (uc < 0x00) { return 0; } else if (uc < 0x80) { @@ -215,8 +219,8 @@ static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8pro dst[0] = 0xC0 + (uc >> 6); dst[1] = 0x80 + (uc & 0x3F); return 2; - // Note: we allow encoding 0xd800-0xdfff here, so as not to change - // the API, however, these are actually invalid in UTF-8 + // Note: we allow encoding 0xd800-0xdfff here, so as not to change + // the API, however, these are actually invalid in UTF-8 } else if (uc < 0x10000) { dst[0] = 0xE0 + (uc >> 12); dst[1] = 0x80 + ((uc >> 6) & 0x3F); @@ -228,81 +232,69 @@ static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8pro dst[2] = 0x80 + ((uc >> 6) & 0x3F); dst[3] = 0x80 + (uc & 0x3F); return 4; - } else return 0; + } else + return 0; } #ifdef __cplusplus #include #include -class UTF8Iterator -{ - std::string_view::const_iterator m_it; -public: - using iterator_category = std::forward_iterator_tag; - using value_type = uint32_t; - using difference_type = std::ptrdiff_t; - using pointer = uint32_t*; - using reference = uint32_t&; +class UTF8Iterator { + std::string_view::const_iterator m_it; - UTF8Iterator(const std::string_view::const_iterator& it) : m_it(it) {} - UTF8Iterator& operator+=(size_t v) - { - for (size_t i=0 ; i(&*m_it), -1, &dummy); +public: + using iterator_category = std::forward_iterator_tag; + using value_type = uint32_t; + using difference_type = std::ptrdiff_t; + using pointer = uint32_t*; + using reference = uint32_t&; + + UTF8Iterator(const std::string_view::const_iterator& it) : m_it(it) {} + UTF8Iterator& operator+=(size_t v) { + for (size_t i = 0; i < v; ++i) { + utf8proc_int32_t dummy; + utf8proc_ssize_t sz = utf8proc_iterate(reinterpret_cast(&*m_it), -1, &dummy); #ifndef NDEBUG - if (*m_it == '\0') - { - fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n"); - abort(); - } - else if (sz > 0) - m_it += sz; - else - { - fprintf(stderr, "ERROR! UTF8Iterator character fail"); - abort(); - } + if (*m_it == '\0') { + fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n"); + abort(); + } else if (sz > 0) + m_it += sz; + else { + fprintf(stderr, "ERROR! UTF8Iterator character fail"); + abort(); + } #else - if (sz > 0) - m_it += sz; + if (sz > 0) + m_it += sz; #endif - } - return *this; } - UTF8Iterator& operator++() - { - return this->operator+=(1); - } - UTF8Iterator operator+(size_t v) const - { - UTF8Iterator ret(m_it); - ret += v; - return ret; - } - uint32_t operator*() const - { - utf8proc_int32_t ret; - utf8proc_iterate(reinterpret_cast(&*m_it), -1, &ret); - return ret; - } - std::string_view::const_iterator iter() const {return m_it;} - size_t countTo(std::string_view::const_iterator end) const - { - UTF8Iterator it(m_it); - size_t ret = 0; - while (it.iter() < end && *it != '\0') - { - ++ret; - ++it; - } - return ret; + return *this; + } + UTF8Iterator& operator++() { return this->operator+=(1); } + UTF8Iterator operator+(size_t v) const { + UTF8Iterator ret(m_it); + ret += v; + return ret; + } + uint32_t operator*() const { + utf8proc_int32_t ret; + utf8proc_iterate(reinterpret_cast(&*m_it), -1, &ret); + return ret; + } + std::string_view::const_iterator iter() const { return m_it; } + size_t countTo(std::string_view::const_iterator end) const { + UTF8Iterator it(m_it); + size_t ret = 0; + while (it.iter() < end && *it != '\0') { + ++ret; + ++it; } + return ret; + } }; #endif #endif -