Remove deprecated cstdbool from utf8proc

This commit is contained in:
Phillip Stephens 2025-05-01 19:26:15 -07:00
parent 4f3531dd1f
commit b37c3d3dcb

View File

@ -20,8 +20,7 @@
* DEALINGS IN THE SOFTWARE. * DEALINGS IN THE SOFTWARE.
*/ */
/**
/**
* @mainpage * @mainpage
* *
* utf8proc is a free/open-source (MIT/expat licensed) C library * utf8proc is a free/open-source (MIT/expat licensed) C library
@ -36,9 +35,11 @@
* The features of utf8proc include: * The features of utf8proc include:
* *
* - Transformation of strings (@ref utf8proc_map) to: * - Transformation of strings (@ref utf8proc_map) to:
* - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character) * - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters
* (http://en.wikipedia.org/wiki/Combining_character)
* - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT) * - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
* - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK) * - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining
* characters such as accents (@ref UTF8PROC_STRIPMARK)
* - case-folding (@ref UTF8PROC_CASEFOLD) * - case-folding (@ref UTF8PROC_CASEFOLD)
* - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
* - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND) * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
@ -53,7 +54,7 @@
#define UTF8PROC_H #define UTF8PROC_H
/** @name API version /** @name API version
* *
* The utf8proc API version MAJOR.MINOR.PATCH, following * The utf8proc API version MAJOR.MINOR.PATCH, following
* semantic-versioning rules (http://semver.org) based on API * semantic-versioning rules (http://semver.org) based on API
* compatibility. * compatibility.
@ -83,26 +84,25 @@ typedef short utf8proc_int16_t;
typedef unsigned short utf8proc_uint16_t; typedef unsigned short utf8proc_uint16_t;
typedef int utf8proc_int32_t; typedef int utf8proc_int32_t;
typedef unsigned int utf8proc_uint32_t; typedef unsigned int utf8proc_uint32_t;
# ifdef _WIN64 #ifdef _WIN64
typedef __int64 utf8proc_ssize_t; typedef __int64 utf8proc_ssize_t;
typedef unsigned __int64 utf8proc_size_t; typedef unsigned __int64 utf8proc_size_t;
# else #else
typedef int utf8proc_ssize_t; typedef int utf8proc_ssize_t;
typedef unsigned int utf8proc_size_t; typedef unsigned int utf8proc_size_t;
# endif #endif
# ifndef __cplusplus #ifndef __cplusplus
typedef unsigned char utf8proc_bool; typedef unsigned char utf8proc_bool;
enum {false, true}; enum { false, true };
# else #else
typedef bool utf8proc_bool; typedef bool utf8proc_bool;
# endif #endif
#else #else
#ifdef __cplusplus #ifdef __cplusplus
# include <cstdbool> #include <cinttypes>
# include <cinttypes>
#else #else
# include <stdbool.h> #include <stdbool.h>
# include <inttypes.h> #include <inttypes.h>
#endif #endif
typedef int8_t utf8proc_int8_t; typedef int8_t utf8proc_int8_t;
typedef uint8_t utf8proc_uint8_t; typedef uint8_t utf8proc_uint8_t;
@ -115,9 +115,9 @@ typedef ssize_t utf8proc_ssize_t;
typedef bool utf8proc_bool; typedef bool utf8proc_bool;
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
# include <climits> #include <climits>
#else #else
# include <limits.h> #include <limits.h>
#endif #endif
/** @name Error codes /** @name Error codes
@ -136,7 +136,7 @@ typedef bool utf8proc_bool;
#define UTF8PROC_ERROR_INVALIDOPTS -5 #define UTF8PROC_ERROR_INVALIDOPTS -5
/** @} */ /** @} */
#define UTF8PROC_cont(ch) (((ch) & 0xc0) == 0x80) #define UTF8PROC_cont(ch) (((ch) & 0xc0) == 0x80)
/** /**
* Reads a single codepoint from the UTF-8 sequence being pointed to by `str`. * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
@ -148,14 +148,14 @@ typedef bool utf8proc_bool;
* In case of success, the number of bytes read is returned; otherwise, a * In case of success, the number of bytes read is returned; otherwise, a
* negative error code is returned. * negative error code is returned.
*/ */
static inline utf8proc_ssize_t utf8proc_iterate( static inline utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t* str, utf8proc_ssize_t strlen,
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst utf8proc_int32_t* dst) {
) {
utf8proc_uint32_t uc; utf8proc_uint32_t uc;
const utf8proc_uint8_t *end; const utf8proc_uint8_t* end;
*dst = -1; *dst = -1;
if (!strlen) return 0; if (!strlen)
return 0;
end = str + ((strlen < 0) ? 4 : strlen); end = str + ((strlen < 0) ? 4 : strlen);
uc = *str++; uc = *str++;
if (uc < 0x80) { if (uc < 0x80) {
@ -163,36 +163,40 @@ static inline utf8proc_ssize_t utf8proc_iterate(
return 1; return 1;
} }
// Must be between 0xc2 and 0xf4 inclusive to be valid // Must be between 0xc2 and 0xf4 inclusive to be valid
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; if ((uc - 0xc2) > (0xf4 - 0xc2))
if (uc < 0xe0) { // 2-byte sequence return UTF8PROC_ERROR_INVALIDUTF8;
// Must have valid continuation character if (uc < 0xe0) { // 2-byte sequence
if (!UTF8PROC_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; // Must have valid continuation character
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f); if (!UTF8PROC_cont(*str))
return 2; return UTF8PROC_ERROR_INVALIDUTF8;
*dst = ((uc & 0x1f) << 6) | (*str & 0x3f);
return 2;
} }
if (uc < 0xf0) { // 3-byte sequence if (uc < 0xf0) { // 3-byte sequence
if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1])) if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]))
return UTF8PROC_ERROR_INVALIDUTF8; return UTF8PROC_ERROR_INVALIDUTF8;
// Check for surrogate chars // Check for surrogate chars
if (uc == 0xed && *str > 0x9f) if (uc == 0xed && *str > 0x9f)
return UTF8PROC_ERROR_INVALIDUTF8; return UTF8PROC_ERROR_INVALIDUTF8;
uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); uc = ((uc & 0xf) << 12) | ((*str & 0x3f) << 6) | (str[1] & 0x3f);
if (uc < 0x800) if (uc < 0x800)
return UTF8PROC_ERROR_INVALIDUTF8; return UTF8PROC_ERROR_INVALIDUTF8;
*dst = uc; *dst = uc;
return 3; return 3;
} }
// 4-byte sequence // 4-byte sequence
// Must have 3 valid continuation characters // Must have 3 valid continuation characters
if ((str + 2 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]) || !UTF8PROC_cont(str[2])) if ((str + 2 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]) || !UTF8PROC_cont(str[2]))
return UTF8PROC_ERROR_INVALIDUTF8; return UTF8PROC_ERROR_INVALIDUTF8;
// Make sure in correct range (0x10000 - 0x10ffff) // Make sure in correct range (0x10000 - 0x10ffff)
if (uc == 0xf0) { if (uc == 0xf0) {
if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; if (*str < 0x90)
return UTF8PROC_ERROR_INVALIDUTF8;
} else if (uc == 0xf4) { } else if (uc == 0xf4) {
if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; if (*str > 0x8f)
return UTF8PROC_ERROR_INVALIDUTF8;
} }
*dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); *dst = ((uc & 7) << 18) | ((*str & 0x3f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f);
return 4; return 4;
} }
@ -205,7 +209,7 @@ static inline utf8proc_ssize_t utf8proc_iterate(
* *
* This function does not check whether `codepoint` is valid Unicode. * This function does not check whether `codepoint` is valid Unicode.
*/ */
static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t* dst) {
if (uc < 0x00) { if (uc < 0x00) {
return 0; return 0;
} else if (uc < 0x80) { } else if (uc < 0x80) {
@ -215,8 +219,8 @@ static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8pro
dst[0] = 0xC0 + (uc >> 6); dst[0] = 0xC0 + (uc >> 6);
dst[1] = 0x80 + (uc & 0x3F); dst[1] = 0x80 + (uc & 0x3F);
return 2; return 2;
// Note: we allow encoding 0xd800-0xdfff here, so as not to change // Note: we allow encoding 0xd800-0xdfff here, so as not to change
// the API, however, these are actually invalid in UTF-8 // the API, however, these are actually invalid in UTF-8
} else if (uc < 0x10000) { } else if (uc < 0x10000) {
dst[0] = 0xE0 + (uc >> 12); dst[0] = 0xE0 + (uc >> 12);
dst[1] = 0x80 + ((uc >> 6) & 0x3F); dst[1] = 0x80 + ((uc >> 6) & 0x3F);
@ -228,81 +232,69 @@ static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8pro
dst[2] = 0x80 + ((uc >> 6) & 0x3F); dst[2] = 0x80 + ((uc >> 6) & 0x3F);
dst[3] = 0x80 + (uc & 0x3F); dst[3] = 0x80 + (uc & 0x3F);
return 4; return 4;
} else return 0; } else
return 0;
} }
#ifdef __cplusplus #ifdef __cplusplus
#include <iterator> #include <iterator>
#include <string> #include <string>
class UTF8Iterator class UTF8Iterator {
{ std::string_view::const_iterator m_it;
std::string_view::const_iterator m_it;
public:
using iterator_category = std::forward_iterator_tag;
using value_type = uint32_t;
using difference_type = std::ptrdiff_t;
using pointer = uint32_t*;
using reference = uint32_t&;
UTF8Iterator(const std::string_view::const_iterator& it) : m_it(it) {} public:
UTF8Iterator& operator+=(size_t v) using iterator_category = std::forward_iterator_tag;
{ using value_type = uint32_t;
for (size_t i=0 ; i<v ; ++i) using difference_type = std::ptrdiff_t;
{ using pointer = uint32_t*;
utf8proc_int32_t dummy; using reference = uint32_t&;
utf8proc_ssize_t sz = utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &dummy);
UTF8Iterator(const std::string_view::const_iterator& it) : m_it(it) {}
UTF8Iterator& operator+=(size_t v) {
for (size_t i = 0; i < v; ++i) {
utf8proc_int32_t dummy;
utf8proc_ssize_t sz = utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &dummy);
#ifndef NDEBUG #ifndef NDEBUG
if (*m_it == '\0') if (*m_it == '\0') {
{ fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n");
fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n"); abort();
abort(); } else if (sz > 0)
} m_it += sz;
else if (sz > 0) else {
m_it += sz; fprintf(stderr, "ERROR! UTF8Iterator character fail");
else abort();
{ }
fprintf(stderr, "ERROR! UTF8Iterator character fail");
abort();
}
#else #else
if (sz > 0) if (sz > 0)
m_it += sz; m_it += sz;
#endif #endif
}
return *this;
} }
UTF8Iterator& operator++() return *this;
{ }
return this->operator+=(1); UTF8Iterator& operator++() { return this->operator+=(1); }
} UTF8Iterator operator+(size_t v) const {
UTF8Iterator operator+(size_t v) const UTF8Iterator ret(m_it);
{ ret += v;
UTF8Iterator ret(m_it); return ret;
ret += v; }
return ret; uint32_t operator*() const {
} utf8proc_int32_t ret;
uint32_t operator*() const utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &ret);
{ return ret;
utf8proc_int32_t ret; }
utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &ret); std::string_view::const_iterator iter() const { return m_it; }
return ret; size_t countTo(std::string_view::const_iterator end) const {
} UTF8Iterator it(m_it);
std::string_view::const_iterator iter() const {return m_it;} size_t ret = 0;
size_t countTo(std::string_view::const_iterator end) const while (it.iter() < end && *it != '\0') {
{ ++ret;
UTF8Iterator it(m_it); ++it;
size_t ret = 0;
while (it.iter() < end && *it != '\0')
{
++ret;
++it;
}
return ret;
} }
return ret;
}
}; };
#endif #endif
#endif #endif