athena/include/utf8proc.h

294 lines
9.8 KiB
C++

/*
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* @mainpage
*
* utf8proc is a free/open-source (MIT/expat licensed) C library
* providing Unicode normalization, case-folding, and other operations
* for strings in the UTF-8 encoding, supporting Unicode version
* 7.0.0. See the utf8proc home page (http://julialang.org/utf8proc/)
* for downloads and other information, or the source code on github
* (https://github.com/JuliaLang/utf8proc).
*
* For the utf8proc API documentation, see: @ref utf8proc.h
*
* The features of utf8proc include:
*
* - Transformation of strings (@ref utf8proc_map) to:
* - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
* - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
* - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
* - case-folding (@ref UTF8PROC_CASEFOLD)
* - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
* - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
* - Character-width computation: @ref utf8proc_charwidth
* - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
* - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
*/
/** @file */
#ifndef UTF8PROC_H
#define UTF8PROC_H
/** @name API version
*
* The utf8proc API version MAJOR.MINOR.PATCH, following
* semantic-versioning rules (http://semver.org) based on API
* compatibility.
*
* This is also returned at runtime by @ref utf8proc_version; however, the
* runtime version may append a string like "-dev" to the version number
* for prerelease versions.
*
* @note The shared-library version number in the Makefile may be different,
* being based on ABI compatibility rather than API compatibility.
*/
/** @{ */
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 1
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 3
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0
/** @} */
#include <stdlib.h>
#include <sys/types.h>
#ifdef _MSC_VER
typedef signed char utf8proc_int8_t;
typedef unsigned char utf8proc_uint8_t;
typedef short utf8proc_int16_t;
typedef unsigned short utf8proc_uint16_t;
typedef int utf8proc_int32_t;
typedef unsigned int utf8proc_uint32_t;
# ifdef _WIN64
typedef __int64 utf8proc_ssize_t;
typedef unsigned __int64 utf8proc_size_t;
# else
typedef int utf8proc_ssize_t;
typedef unsigned int utf8proc_size_t;
# endif
# ifndef __cplusplus
typedef unsigned char utf8proc_bool;
enum {false, true};
# else
typedef bool utf8proc_bool;
# endif
#else
# include <stdbool.h>
# include <inttypes.h>
typedef int8_t utf8proc_int8_t;
typedef uint8_t utf8proc_uint8_t;
typedef int16_t utf8proc_int16_t;
typedef uint16_t utf8proc_uint16_t;
typedef int32_t utf8proc_int32_t;
typedef uint32_t utf8proc_uint32_t;
typedef size_t utf8proc_size_t;
typedef ssize_t utf8proc_ssize_t;
typedef bool utf8proc_bool;
#endif
#include <limits.h>
/** @name Error codes
* Error codes being returned by almost all functions.
*/
/** @{ */
/** Memory could not be allocated. */
#define UTF8PROC_ERROR_NOMEM -1
/** The given string is too long to be processed. */
#define UTF8PROC_ERROR_OVERFLOW -2
/** The given string is not a legal UTF-8 string. */
#define UTF8PROC_ERROR_INVALIDUTF8 -3
/** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */
#define UTF8PROC_ERROR_NOTASSIGNED -4
/** Invalid options have been used. */
#define UTF8PROC_ERROR_INVALIDOPTS -5
/** @} */
#define UTF8PROC_cont(ch) (((ch) & 0xc0) == 0x80)
/**
* Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
* The maximum number of bytes read is `strlen`, unless `strlen` is
* negative (in which case up to 4 bytes are read).
*
* If a valid codepoint could be read, it is stored in the variable
* pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
* In case of success, the number of bytes read is returned; otherwise, a
* negative error code is returned.
*/
static inline utf8proc_ssize_t utf8proc_iterate(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
) {
utf8proc_uint32_t uc;
const utf8proc_uint8_t *end;
*dst = -1;
if (!strlen) return 0;
end = str + ((strlen < 0) ? 4 : strlen);
uc = *str++;
if (uc < 0x80) {
*dst = uc;
return 1;
}
// Must be between 0xc2 and 0xf4 inclusive to be valid
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
if (uc < 0xe0) { // 2-byte sequence
// Must have valid continuation character
if (!UTF8PROC_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
return 2;
}
if (uc < 0xf0) { // 3-byte sequence
if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]))
return UTF8PROC_ERROR_INVALIDUTF8;
// Check for surrogate chars
if (uc == 0xed && *str > 0x9f)
return UTF8PROC_ERROR_INVALIDUTF8;
uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
if (uc < 0x800)
return UTF8PROC_ERROR_INVALIDUTF8;
*dst = uc;
return 3;
}
// 4-byte sequence
// Must have 3 valid continuation characters
if ((str + 2 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]) || !UTF8PROC_cont(str[2]))
return UTF8PROC_ERROR_INVALIDUTF8;
// Make sure in correct range (0x10000 - 0x10ffff)
if (uc == 0xf0) {
if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
} else if (uc == 0xf4) {
if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
}
*dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
return 4;
}
/**
* Encodes the codepoint as an UTF-8 string in the byte array pointed
* to by `dst`. This array must be at least 4 bytes long.
*
* In case of success the number of bytes written is returned, and
* otherwise 0 is returned.
*
* This function does not check whether `codepoint` is valid Unicode.
*/
static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
if (uc < 0x00) {
return 0;
} else if (uc < 0x80) {
dst[0] = uc;
return 1;
} else if (uc < 0x800) {
dst[0] = 0xC0 + (uc >> 6);
dst[1] = 0x80 + (uc & 0x3F);
return 2;
// Note: we allow encoding 0xd800-0xdfff here, so as not to change
// the API, however, these are actually invalid in UTF-8
} else if (uc < 0x10000) {
dst[0] = 0xE0 + (uc >> 12);
dst[1] = 0x80 + ((uc >> 6) & 0x3F);
dst[2] = 0x80 + (uc & 0x3F);
return 3;
} else if (uc < 0x110000) {
dst[0] = 0xF0 + (uc >> 18);
dst[1] = 0x80 + ((uc >> 12) & 0x3F);
dst[2] = 0x80 + ((uc >> 6) & 0x3F);
dst[3] = 0x80 + (uc & 0x3F);
return 4;
} else return 0;
}
#ifdef __cplusplus
#include <iterator>
#include <string>
class UTF8Iterator : public std::iterator<std::forward_iterator_tag, uint32_t>
{
std::string::const_iterator m_it;
public:
UTF8Iterator(const std::string::const_iterator& it) : m_it(it) {}
UTF8Iterator& operator+=(size_t v)
{
for (size_t i=0 ; i<v ; ++i)
{
utf8proc_int32_t dummy;
utf8proc_ssize_t sz = utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &dummy);
#ifndef NDEBUG
if (*m_it == '\0')
{
fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n");
abort();
}
else if (sz > 0)
m_it += sz;
else
{
fprintf(stderr, "ERROR! UTF8Iterator character fail");
abort();
}
#else
if (sz > 0)
m_it += sz;
#endif
}
return *this;
}
UTF8Iterator& operator++()
{
return this->operator+=(1);
}
UTF8Iterator operator+(size_t v) const
{
UTF8Iterator ret(m_it);
ret += v;
return ret;
}
uint32_t operator*() const
{
utf8proc_int32_t ret;
utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &ret);
return ret;
}
std::string::const_iterator iter() const {return m_it;}
size_t countTo(std::string::const_iterator end) const
{
UTF8Iterator it(m_it);
size_t ret = 0;
while (it.iter() < end && *it != '\0')
{
++ret;
++it;
}
return ret;
}
};
#endif
#endif