mirror of
				https://github.com/encounter/aurora.git
				synced 2025-10-27 12:20:22 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			294 lines
		
	
	
		
			9.8 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			294 lines
		
	
	
		
			9.8 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 | |
|  *
 | |
|  * Permission is hereby granted, free of charge, to any person obtaining a
 | |
|  * copy of this software and associated documentation files (the "Software"),
 | |
|  * to deal in the Software without restriction, including without limitation
 | |
|  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 | |
|  * and/or sell copies of the Software, and to permit persons to whom the
 | |
|  * Software is furnished to do so, subject to the following conditions:
 | |
|  *
 | |
|  * The above copyright notice and this permission notice shall be included in
 | |
|  * all copies or substantial portions of the Software.
 | |
|  *
 | |
|  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | |
|  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | |
|  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | |
|  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | |
|  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 | |
|  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 | |
|  * DEALINGS IN THE SOFTWARE.
 | |
|  */
 | |
| 
 | |
| 
 | |
| /**
 | |
|  * @mainpage
 | |
|  *
 | |
|  * utf8proc is a free/open-source (MIT/expat licensed) C library
 | |
|  * providing Unicode normalization, case-folding, and other operations
 | |
|  * for strings in the UTF-8 encoding, supporting Unicode version
 | |
|  * 7.0.0.  See the utf8proc home page (http://julialang.org/utf8proc/)
 | |
|  * for downloads and other information, or the source code on github
 | |
|  * (https://github.com/JuliaLang/utf8proc).
 | |
|  *
 | |
|  * For the utf8proc API documentation, see: @ref utf8proc.h
 | |
|  *
 | |
|  * The features of utf8proc include:
 | |
|  *
 | |
|  * - Transformation of strings (@ref utf8proc_map) to:
 | |
|  *    - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
 | |
|  *    - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
 | |
|  *    - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
 | |
|  *    - case-folding (@ref UTF8PROC_CASEFOLD)
 | |
|  * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
 | |
|  * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
 | |
|  * - Character-width computation: @ref utf8proc_charwidth
 | |
|  * - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
 | |
|  * - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
 | |
|  */
 | |
| 
 | |
| /** @file */
 | |
| 
 | |
| #ifndef UTF8PROC_H
 | |
| #define UTF8PROC_H
 | |
| 
 | |
| /** @name API version
 | |
|  *
 | |
|  * The utf8proc API version MAJOR.MINOR.PATCH, following
 | |
|  * semantic-versioning rules (http://semver.org) based on API
 | |
|  * compatibility.
 | |
|  *
 | |
|  * This is also returned at runtime by @ref utf8proc_version; however, the
 | |
|  * runtime version may append a string like "-dev" to the version number
 | |
|  * for prerelease versions.
 | |
|  *
 | |
|  * @note The shared-library version number in the Makefile may be different,
 | |
|  *       being based on ABI compatibility rather than API compatibility.
 | |
|  */
 | |
| /** @{ */
 | |
| /** The MAJOR version number (increased when backwards API compatibility is broken). */
 | |
| #define UTF8PROC_VERSION_MAJOR 1
 | |
| /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
 | |
| #define UTF8PROC_VERSION_MINOR 3
 | |
| /** The PATCH version (increased for fixes that do not change the API). */
 | |
| #define UTF8PROC_VERSION_PATCH 0
 | |
| /** @} */
 | |
| 
 | |
| #include <stdlib.h>
 | |
| #include <sys/types.h>
 | |
| #ifdef _MSC_VER
 | |
| typedef signed char utf8proc_int8_t;
 | |
| typedef unsigned char utf8proc_uint8_t;
 | |
| typedef short utf8proc_int16_t;
 | |
| typedef unsigned short utf8proc_uint16_t;
 | |
| typedef int utf8proc_int32_t;
 | |
| typedef unsigned int utf8proc_uint32_t;
 | |
| #  ifdef _WIN64
 | |
| typedef __int64 utf8proc_ssize_t;
 | |
| typedef unsigned __int64 utf8proc_size_t;
 | |
| #  else
 | |
| typedef int utf8proc_ssize_t;
 | |
| typedef unsigned int utf8proc_size_t;
 | |
| #  endif
 | |
| #  ifndef __cplusplus
 | |
| typedef unsigned char utf8proc_bool;
 | |
| enum {false, true};
 | |
| #  else
 | |
| typedef bool utf8proc_bool;
 | |
| #  endif
 | |
| #else
 | |
| #  include <stdbool.h>
 | |
| #  include <inttypes.h>
 | |
| typedef int8_t utf8proc_int8_t;
 | |
| typedef uint8_t utf8proc_uint8_t;
 | |
| typedef int16_t utf8proc_int16_t;
 | |
| typedef uint16_t utf8proc_uint16_t;
 | |
| typedef int32_t utf8proc_int32_t;
 | |
| typedef uint32_t utf8proc_uint32_t;
 | |
| typedef size_t utf8proc_size_t;
 | |
| typedef ssize_t utf8proc_ssize_t;
 | |
| typedef bool utf8proc_bool;
 | |
| #endif
 | |
| #include <limits.h>
 | |
| 
 | |
| /** @name Error codes
 | |
|  * Error codes being returned by almost all functions.
 | |
|  */
 | |
| /** @{ */
 | |
| /** Memory could not be allocated. */
 | |
| #define UTF8PROC_ERROR_NOMEM -1
 | |
| /** The given string is too long to be processed. */
 | |
| #define UTF8PROC_ERROR_OVERFLOW -2
 | |
| /** The given string is not a legal UTF-8 string. */
 | |
| #define UTF8PROC_ERROR_INVALIDUTF8 -3
 | |
| /** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */
 | |
| #define UTF8PROC_ERROR_NOTASSIGNED -4
 | |
| /** Invalid options have been used. */
 | |
| #define UTF8PROC_ERROR_INVALIDOPTS -5
 | |
| /** @} */
 | |
| 
 | |
| #define UTF8PROC_cont(ch)  (((ch) & 0xc0) == 0x80)
 | |
| 
 | |
| /**
 | |
|  * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
 | |
|  * The maximum number of bytes read is `strlen`, unless `strlen` is
 | |
|  * negative (in which case up to 4 bytes are read).
 | |
|  *
 | |
|  * If a valid codepoint could be read, it is stored in the variable
 | |
|  * pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
 | |
|  * In case of success, the number of bytes read is returned; otherwise, a
 | |
|  * negative error code is returned.
 | |
|  */
 | |
| static inline utf8proc_ssize_t utf8proc_iterate(
 | |
|   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
 | |
| ) {
 | |
|   utf8proc_uint32_t uc;
 | |
|   const utf8proc_uint8_t *end;
 | |
| 
 | |
|   *dst = -1;
 | |
|   if (!strlen) return 0;
 | |
|   end = str + ((strlen < 0) ? 4 : strlen);
 | |
|   uc = *str++;
 | |
|   if (uc < 0x80) {
 | |
|     *dst = uc;
 | |
|     return 1;
 | |
|   }
 | |
|   // Must be between 0xc2 and 0xf4 inclusive to be valid
 | |
|   if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|   if (uc < 0xe0) {         // 2-byte sequence
 | |
|      // Must have valid continuation character
 | |
|      if (!UTF8PROC_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|      *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
 | |
|      return 2;
 | |
|   }
 | |
|   if (uc < 0xf0) {        // 3-byte sequence
 | |
|      if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]))
 | |
|         return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|      // Check for surrogate chars
 | |
|      if (uc == 0xed && *str > 0x9f)
 | |
|          return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|      uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
 | |
|      if (uc < 0x800)
 | |
|          return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|      *dst = uc;
 | |
|      return 3;
 | |
|   }
 | |
|   // 4-byte sequence
 | |
|   // Must have 3 valid continuation characters
 | |
|   if ((str + 2 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]) || !UTF8PROC_cont(str[2]))
 | |
|      return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|   // Make sure in correct range (0x10000 - 0x10ffff)
 | |
|   if (uc == 0xf0) {
 | |
|     if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|   } else if (uc == 0xf4) {
 | |
|     if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
 | |
|   }
 | |
|   *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
 | |
|   return 4;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Encodes the codepoint as an UTF-8 string in the byte array pointed
 | |
|  * to by `dst`. This array must be at least 4 bytes long.
 | |
|  *
 | |
|  * In case of success the number of bytes written is returned, and
 | |
|  * otherwise 0 is returned.
 | |
|  *
 | |
|  * This function does not check whether `codepoint` is valid Unicode.
 | |
|  */
 | |
| static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
 | |
|   if (uc < 0x00) {
 | |
|     return 0;
 | |
|   } else if (uc < 0x80) {
 | |
|     dst[0] = uc;
 | |
|     return 1;
 | |
|   } else if (uc < 0x800) {
 | |
|     dst[0] = 0xC0 + (uc >> 6);
 | |
|     dst[1] = 0x80 + (uc & 0x3F);
 | |
|     return 2;
 | |
|   // Note: we allow encoding 0xd800-0xdfff here, so as not to change
 | |
|   // the API, however, these are actually invalid in UTF-8
 | |
|   } else if (uc < 0x10000) {
 | |
|     dst[0] = 0xE0 + (uc >> 12);
 | |
|     dst[1] = 0x80 + ((uc >> 6) & 0x3F);
 | |
|     dst[2] = 0x80 + (uc & 0x3F);
 | |
|     return 3;
 | |
|   } else if (uc < 0x110000) {
 | |
|     dst[0] = 0xF0 + (uc >> 18);
 | |
|     dst[1] = 0x80 + ((uc >> 12) & 0x3F);
 | |
|     dst[2] = 0x80 + ((uc >> 6) & 0x3F);
 | |
|     dst[3] = 0x80 + (uc & 0x3F);
 | |
|     return 4;
 | |
|   } else return 0;
 | |
| }
 | |
| 
 | |
| #ifdef __cplusplus
 | |
| #include <iterator>
 | |
| #include <string>
 | |
| 
 | |
| class UTF8Iterator : public std::iterator<std::forward_iterator_tag, uint32_t>
 | |
| {
 | |
|     std::string::const_iterator m_it;
 | |
| public:
 | |
|     UTF8Iterator(const std::string::const_iterator& it) : m_it(it) {}
 | |
|     UTF8Iterator& operator+=(size_t v)
 | |
|     {
 | |
|         for (size_t i=0 ; i<v ; ++i)
 | |
|         {
 | |
|             utf8proc_int32_t dummy;
 | |
|             utf8proc_ssize_t sz = utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &dummy);
 | |
| #ifndef NDEBUG
 | |
|             if (*m_it == '\0')
 | |
|             {
 | |
|                 fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n");
 | |
|                 abort();
 | |
|             }
 | |
|             else if (sz > 0)
 | |
|                 m_it += sz;
 | |
|             else
 | |
|             {
 | |
|                 fprintf(stderr, "ERROR! UTF8Iterator character fail");
 | |
|                 abort();
 | |
|             }
 | |
| #else
 | |
|             if (sz > 0)
 | |
|                 m_it += sz;
 | |
| #endif
 | |
|         }
 | |
|         return *this;
 | |
|     }
 | |
|     UTF8Iterator& operator++()
 | |
|     {
 | |
|         return this->operator+=(1);
 | |
|     }
 | |
|     UTF8Iterator operator+(size_t v) const
 | |
|     {
 | |
|         UTF8Iterator ret(m_it);
 | |
|         ret += v;
 | |
|         return ret;
 | |
|     }
 | |
|     uint32_t operator*() const
 | |
|     {
 | |
|         utf8proc_int32_t ret;
 | |
|         utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &ret);
 | |
|         return ret;
 | |
|     }
 | |
|     std::string::const_iterator iter() const {return m_it;}
 | |
|     size_t countTo(std::string::const_iterator end) const
 | |
|     {
 | |
|         UTF8Iterator it(m_it);
 | |
|         size_t ret = 0;
 | |
|         while (it.iter() < end)
 | |
|         {
 | |
|             ++ret;
 | |
|             ++it;
 | |
|         }
 | |
|         return ret;
 | |
|     }
 | |
| };
 | |
| 
 | |
| #endif
 | |
| 
 | |
| #endif
 | |
| 
 |