athena/include/utf8proc.h

/*
 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */


/** 
 * @mainpage
 *
 * utf8proc is a free/open-source (MIT/expat licensed) C library
 * providing Unicode normalization, case-folding, and other operations
 * for strings in the UTF-8 encoding, supporting Unicode version
 * 7.0.0.  See the utf8proc home page (http://julialang.org/utf8proc/)
 * for downloads and other information, or the source code on github
 * (https://github.com/JuliaLang/utf8proc).
 *
 * For the utf8proc API documentation, see: @ref utf8proc.h
 *
 * The features of utf8proc include:
 *
 * - Transformation of strings (@ref utf8proc_map) to:
 *    - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
 *    - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
 *    - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
 *    - case-folding (@ref UTF8PROC_CASEFOLD)
 * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
 * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
 * - Character-width computation: @ref utf8proc_charwidth
 * - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
 * - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
 */

/** @file */

#ifndef UTF8PROC_H
#define UTF8PROC_H

/** @name API version
 *  
 * The utf8proc API version MAJOR.MINOR.PATCH, following
 * semantic-versioning rules (http://semver.org) based on API
 * compatibility.
 *
 * This is also returned at runtime by @ref utf8proc_version; however, the
 * runtime version may append a string like "-dev" to the version number
 * for prerelease versions.
 *
 * @note The shared-library version number in the Makefile may be different,
 *       being based on ABI compatibility rather than API compatibility.
 */
/** @{ */
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 1
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 3
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0
/** @} */

#include <stdlib.h>
#include <sys/types.h>
#ifdef _MSC_VER
typedef signed char utf8proc_int8_t;
typedef unsigned char utf8proc_uint8_t;
typedef short utf8proc_int16_t;
typedef unsigned short utf8proc_uint16_t;
typedef int utf8proc_int32_t;
typedef unsigned int utf8proc_uint32_t;
#  ifdef _WIN64
typedef __int64 utf8proc_ssize_t;
typedef unsigned __int64 utf8proc_size_t;
#  else
typedef int utf8proc_ssize_t;
typedef unsigned int utf8proc_size_t;
#  endif
#  ifndef __cplusplus
typedef unsigned char utf8proc_bool;
enum {false, true};
#  else
typedef bool utf8proc_bool;
#  endif
#else
#ifdef __cplusplus
#  include <cstdbool>
#  include <cinttypes>
#else
#  include <stdbool.h>
#  include <inttypes.h>
#endif
typedef int8_t utf8proc_int8_t;
typedef uint8_t utf8proc_uint8_t;
typedef int16_t utf8proc_int16_t;
typedef uint16_t utf8proc_uint16_t;
typedef int32_t utf8proc_int32_t;
typedef uint32_t utf8proc_uint32_t;
typedef size_t utf8proc_size_t;
typedef ssize_t utf8proc_ssize_t;
typedef bool utf8proc_bool;
#endif
#ifdef __cplusplus
#  include <climits>
#else
#  include <limits.h>
#endif

/** @name Error codes
 * Error codes being returned by almost all functions.
 */
/** @{ */
/** Memory could not be allocated. */
#define UTF8PROC_ERROR_NOMEM -1
/** The given string is too long to be processed. */
#define UTF8PROC_ERROR_OVERFLOW -2
/** The given string is not a legal UTF-8 string. */
#define UTF8PROC_ERROR_INVALIDUTF8 -3
/** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */
#define UTF8PROC_ERROR_NOTASSIGNED -4
/** Invalid options have been used. */
#define UTF8PROC_ERROR_INVALIDOPTS -5
/** @} */

#define UTF8PROC_cont(ch)  (((ch) & 0xc0) == 0x80)

/**
 * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
 * The maximum number of bytes read is `strlen`, unless `strlen` is
 * negative (in which case up to 4 bytes are read).
 *
 * If a valid codepoint could be read, it is stored in the variable
 * pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
 * In case of success, the number of bytes read is returned; otherwise, a
 * negative error code is returned.
 */
static inline utf8proc_ssize_t utf8proc_iterate(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
) {
  utf8proc_uint32_t uc;
  const utf8proc_uint8_t *end;

  *dst = -1;
  if (!strlen) return 0;
  end = str + ((strlen < 0) ? 4 : strlen);
  uc = *str++;
  if (uc < 0x80) {
    *dst = uc;
    return 1;
  }
  // Must be between 0xc2 and 0xf4 inclusive to be valid
  if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
  if (uc < 0xe0) {         // 2-byte sequence
     // Must have valid continuation character
     if (!UTF8PROC_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
     return 2;
  }
  if (uc < 0xf0) {        // 3-byte sequence
     if ((str + 1 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]))
        return UTF8PROC_ERROR_INVALIDUTF8;
     // Check for surrogate chars
     if (uc == 0xed && *str > 0x9f)
         return UTF8PROC_ERROR_INVALIDUTF8;
     uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
     if (uc < 0x800)
         return UTF8PROC_ERROR_INVALIDUTF8;
     *dst = uc;
     return 3;
  }
  // 4-byte sequence
  // Must have 3 valid continuation characters
  if ((str + 2 >= end) || !UTF8PROC_cont(*str) || !UTF8PROC_cont(str[1]) || !UTF8PROC_cont(str[2]))
     return UTF8PROC_ERROR_INVALIDUTF8;
  // Make sure in correct range (0x10000 - 0x10ffff)
  if (uc == 0xf0) {
    if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
  } else if (uc == 0xf4) {
    if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
  }
  *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
  return 4;
}

/**
 * Encodes the codepoint as an UTF-8 string in the byte array pointed
 * to by `dst`. This array must be at least 4 bytes long.
 *
 * In case of success the number of bytes written is returned, and
 * otherwise 0 is returned.
 *
 * This function does not check whether `codepoint` is valid Unicode.
 */
static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
    dst[0] = uc;
    return 1;
  } else if (uc < 0x800) {
    dst[0] = 0xC0 + (uc >> 6);
    dst[1] = 0x80 + (uc & 0x3F);
    return 2;
  // Note: we allow encoding 0xd800-0xdfff here, so as not to change
  // the API, however, these are actually invalid in UTF-8
  } else if (uc < 0x10000) {
    dst[0] = 0xE0 + (uc >> 12);
    dst[1] = 0x80 + ((uc >> 6) & 0x3F);
    dst[2] = 0x80 + (uc & 0x3F);
    return 3;
  } else if (uc < 0x110000) {
    dst[0] = 0xF0 + (uc >> 18);
    dst[1] = 0x80 + ((uc >> 12) & 0x3F);
    dst[2] = 0x80 + ((uc >> 6) & 0x3F);
    dst[3] = 0x80 + (uc & 0x3F);
    return 4;
  } else return 0;
}

#ifdef __cplusplus
#include <iterator>
#include <string>

class UTF8Iterator
{
    std::string_view::const_iterator m_it;
public:
    using iterator_category = std::forward_iterator_tag;
    using value_type = uint32_t;
    using difference_type = std::ptrdiff_t;
    using pointer = uint32_t*;
    using reference = uint32_t&;

    UTF8Iterator(const std::string_view::const_iterator& it) : m_it(it) {}
    UTF8Iterator& operator+=(size_t v)
    {
        for (size_t i=0 ; i<v ; ++i)
        {
            utf8proc_int32_t dummy;
            utf8proc_ssize_t sz = utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &dummy);
#ifndef NDEBUG
            if (*m_it == '\0')
            {
                fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n");
                abort();
            }
            else if (sz > 0)
                m_it += sz;
            else
            {
                fprintf(stderr, "ERROR! UTF8Iterator character fail");
                abort();
            }
#else
            if (sz > 0)
                m_it += sz;
#endif
        }
        return *this;
    }
    UTF8Iterator& operator++()
    {
        return this->operator+=(1);
    }
    UTF8Iterator operator+(size_t v) const
    {
        UTF8Iterator ret(m_it);
        ret += v;
        return ret;
    }
    uint32_t operator*() const
    {
        utf8proc_int32_t ret;
        utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(&*m_it), -1, &ret);
        return ret;
    }
    std::string_view::const_iterator iter() const {return m_it;}
    size_t countTo(std::string_view::const_iterator end) const
    {
        UTF8Iterator it(m_it);
        size_t ret = 0;
        while (it.iter() < end && *it != '\0')
        {
            ++ret;
            ++it;
        }
        return ret;
    }
};

#endif

#endif
stripped-down utf8proc added 2015-08-31 21:11:42 +00:00			`/*`
			`* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a`
			`* copy of this software and associated documentation files (the "Software"),`
			`* to deal in the Software without restriction, including without limitation`
			`* the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`* and/or sell copies of the Software, and to permit persons to whom the`
			`* Software is furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included in`
			`* all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING`
			`* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER`
			`* DEALINGS IN THE SOFTWARE.`
			`*/`


			`/**`
			`* @mainpage`
			`*`
			`* utf8proc is a free/open-source (MIT/expat licensed) C library`
			`* providing Unicode normalization, case-folding, and other operations`
			`* for strings in the UTF-8 encoding, supporting Unicode version`
			`* 7.0.0. See the utf8proc home page (http://julialang.org/utf8proc/)`
			`* for downloads and other information, or the source code on github`
			`* (https://github.com/JuliaLang/utf8proc).`
			`*`
			`* For the utf8proc API documentation, see: @ref utf8proc.h`
			`*`
			`* The features of utf8proc include:`
			`*`
			`* - Transformation of strings (@ref utf8proc_map) to:`
			`* - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)`
			`* - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)`
			`* - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)`
			`* - case-folding (@ref UTF8PROC_CASEFOLD)`
			`* - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC`
			`* - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)`
			`* - Character-width computation: @ref utf8proc_charwidth`
			`* - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string`
			`* - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.`
			`*/`

			`/** @file */`

			`#ifndef UTF8PROC_H`
			`#define UTF8PROC_H`

			`/** @name API version`
			`*`
			`* The utf8proc API version MAJOR.MINOR.PATCH, following`
			`* semantic-versioning rules (http://semver.org) based on API`
			`* compatibility.`
			`*`
			`* This is also returned at runtime by @ref utf8proc_version; however, the`
			`* runtime version may append a string like "-dev" to the version number`
			`* for prerelease versions.`
			`*`
			`* @note The shared-library version number in the Makefile may be different,`
			`* being based on ABI compatibility rather than API compatibility.`
			`*/`
			`/** @{ */`
			`/** The MAJOR version number (increased when backwards API compatibility is broken). */`
			`#define UTF8PROC_VERSION_MAJOR 1`
			`/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */`
			`#define UTF8PROC_VERSION_MINOR 3`
			`/** The PATCH version (increased for fixes that do not change the API). */`
			`#define UTF8PROC_VERSION_PATCH 0`
			`/** @} */`

			`#include <stdlib.h>`
			`#include <sys/types.h>`
			`#ifdef _MSC_VER`
			`typedef signed char utf8proc_int8_t;`
			`typedef unsigned char utf8proc_uint8_t;`
			`typedef short utf8proc_int16_t;`
			`typedef unsigned short utf8proc_uint16_t;`
			`typedef int utf8proc_int32_t;`
			`typedef unsigned int utf8proc_uint32_t;`
			`# ifdef _WIN64`
			`typedef __int64 utf8proc_ssize_t;`
			`typedef unsigned __int64 utf8proc_size_t;`
			`# else`
			`typedef int utf8proc_ssize_t;`
			`typedef unsigned int utf8proc_size_t;`
			`# endif`
			`# ifndef __cplusplus`
			`typedef unsigned char utf8proc_bool;`
			`enum {false, true};`
			`# else`
			`typedef bool utf8proc_bool;`
			`# endif`
			`#else`
Huge compile performance refactor 2017-12-29 07:55:42 +00:00			`#ifdef __cplusplus`
			`# include <cstdbool>`
			`# include <cinttypes>`
			`#else`
stripped-down utf8proc added 2015-08-31 21:11:42 +00:00			`# include <stdbool.h>`
			`# include <inttypes.h>`
Huge compile performance refactor 2017-12-29 07:55:42 +00:00			`#endif`
stripped-down utf8proc added 2015-08-31 21:11:42 +00:00			`typedef int8_t utf8proc_int8_t;`
			`typedef uint8_t utf8proc_uint8_t;`
			`typedef int16_t utf8proc_int16_t;`
			`typedef uint16_t utf8proc_uint16_t;`
			`typedef int32_t utf8proc_int32_t;`
			`typedef uint32_t utf8proc_uint32_t;`
			`typedef size_t utf8proc_size_t;`
			`typedef ssize_t utf8proc_ssize_t;`
			`typedef bool utf8proc_bool;`
			`#endif`
Huge compile performance refactor 2017-12-29 07:55:42 +00:00			`#ifdef __cplusplus`
			`# include <climits>`
			`#else`
			`# include <limits.h>`
			`#endif`
stripped-down utf8proc added 2015-08-31 21:11:42 +00:00
			`/** @name Error codes`
			`* Error codes being returned by almost all functions.`
			`*/`
			`/** @{ */`
			`/** Memory could not be allocated. */`
			`#define UTF8PROC_ERROR_NOMEM -1`
			`/** The given string is too long to be processed. */`
			`#define UTF8PROC_ERROR_OVERFLOW -2`
			`/** The given string is not a legal UTF-8 string. */`
			`#define UTF8PROC_ERROR_INVALIDUTF8 -3`
			`/** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */`
			`#define UTF8PROC_ERROR_NOTASSIGNED -4`
			`/** Invalid options have been used. */`
			`#define UTF8PROC_ERROR_INVALIDOPTS -5`
			`/** @} */`

			`#define UTF8PROC_cont(ch) (((ch) & 0xc0) == 0x80)`

			`/**`
			* Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
			* The maximum number of bytes read is `strlen`, unless `strlen` is
			`* negative (in which case up to 4 bytes are read).`
			`*`
			`* If a valid codepoint could be read, it is stored in the variable`
			* pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
			`* In case of success, the number of bytes read is returned; otherwise, a`
			`* negative error code is returned.`
			`*/`
			`static inline utf8proc_ssize_t utf8proc_iterate(`
			`const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_int32_t dst`
			`) {`
			`utf8proc_uint32_t uc;`
			`const utf8proc_uint8_t *end;`

			`*dst = -1;`
			`if (!strlen) return 0;`
			`end = str + ((strlen < 0) ? 4 : strlen);`
			`uc = *str++;`
			`if (uc < 0x80) {`
			`*dst = uc;`
			`return 1;`
			`}`
			`// Must be between 0xc2 and 0xf4 inclusive to be valid`
			`if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;`
			`if (uc < 0xe0) { // 2-byte sequence`
			`// Must have valid continuation character`
			`if (!UTF8PROC_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;`
			`dst = ((uc & 0x1f)<<6) \| (str & 0x3f);`
			`return 2;`
			`}`
			`if (uc < 0xf0) { // 3-byte sequence`
			`if ((str + 1 >= end) \|\| !UTF8PROC_cont(*str) \|\| !UTF8PROC_cont(str[1]))`
			`return UTF8PROC_ERROR_INVALIDUTF8;`
			`// Check for surrogate chars`
			`if (uc == 0xed && *str > 0x9f)`
			`return UTF8PROC_ERROR_INVALIDUTF8;`
			`uc = ((uc & 0xf)<<12) \| ((*str & 0x3f)<<6) \| (str[1] & 0x3f);`
			`if (uc < 0x800)`
			`return UTF8PROC_ERROR_INVALIDUTF8;`
			`*dst = uc;`
			`return 3;`
			`}`
			`// 4-byte sequence`
			`// Must have 3 valid continuation characters`
			`if ((str + 2 >= end) \|\| !UTF8PROC_cont(*str) \|\| !UTF8PROC_cont(str[1]) \|\| !UTF8PROC_cont(str[2]))`
			`return UTF8PROC_ERROR_INVALIDUTF8;`
			`// Make sure in correct range (0x10000 - 0x10ffff)`
			`if (uc == 0xf0) {`
			`if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;`
			`} else if (uc == 0xf4) {`
			`if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;`
			`}`
			`dst = ((uc & 7)<<18) \| ((str & 0x3f)<<12) \| ((str[1] & 0x3f)<<6) \| (str[2] & 0x3f);`
			`return 4;`
			`}`

			`/**`
			`* Encodes the codepoint as an UTF-8 string in the byte array pointed`
			* to by `dst`. This array must be at least 4 bytes long.
			`*`
			`* In case of success the number of bytes written is returned, and`
			`* otherwise 0 is returned.`
			`*`
			* This function does not check whether `codepoint` is valid Unicode.
			`*/`
			`static inline utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {`
			`if (uc < 0x00) {`
			`return 0;`
			`} else if (uc < 0x80) {`
			`dst[0] = uc;`
			`return 1;`
			`} else if (uc < 0x800) {`
			`dst[0] = 0xC0 + (uc >> 6);`
			`dst[1] = 0x80 + (uc & 0x3F);`
			`return 2;`
			`// Note: we allow encoding 0xd800-0xdfff here, so as not to change`
			`// the API, however, these are actually invalid in UTF-8`
			`} else if (uc < 0x10000) {`
			`dst[0] = 0xE0 + (uc >> 12);`
			`dst[1] = 0x80 + ((uc >> 6) & 0x3F);`
			`dst[2] = 0x80 + (uc & 0x3F);`
			`return 3;`
			`} else if (uc < 0x110000) {`
			`dst[0] = 0xF0 + (uc >> 18);`
			`dst[1] = 0x80 + ((uc >> 12) & 0x3F);`
			`dst[2] = 0x80 + ((uc >> 6) & 0x3F);`
			`dst[3] = 0x80 + (uc & 0x3F);`
			`return 4;`
			`} else return 0;`
			`}`

Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`#ifdef __cplusplus`
			`#include <iterator>`
Add <string> header 2015-12-20 21:17:12 +00:00			`#include <string>`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00
Add cross-compiling support for atdna 2017-12-07 04:07:33 +00:00			`class UTF8Iterator`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`{`
string_view refactor 2017-11-13 06:12:37 +00:00			`std::string_view::const_iterator m_it;`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`public:`
Add cross-compiling support for atdna 2017-12-07 04:07:33 +00:00			`using iterator_category = std::forward_iterator_tag;`
			`using value_type = uint32_t;`
			`using difference_type = std::ptrdiff_t;`
			`using pointer = uint32_t*;`
			`using reference = uint32_t&;`

string_view refactor 2017-11-13 06:12:37 +00:00			`UTF8Iterator(const std::string_view::const_iterator& it) : m_it(it) {}`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`UTF8Iterator& operator+=(size_t v)`
			`{`
			`for (size_t i=0 ; i<v ; ++i)`
			`{`
			`utf8proc_int32_t dummy;`
More portable iterator->pointer 2015-12-20 22:51:55 +00:00			`utf8proc_ssize_t sz = utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t>(&m_it), -1, &dummy);`
Update utf8proc 2015-12-27 22:38:37 +00:00			`#ifndef NDEBUG`
			`if (*m_it == '\0')`
			`{`
			`fprintf(stderr, "ERROR! UTF8-iterator null-term fail\n");`
			`abort();`
			`}`
			`else if (sz > 0)`
			`m_it += sz;`
			`else`
			`{`
			`fprintf(stderr, "ERROR! UTF8Iterator character fail");`
			`abort();`
			`}`
			`#else`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`if (sz > 0)`
			`m_it += sz;`
Update utf8proc 2015-12-27 22:38:37 +00:00			`#endif`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`}`
			`return *this;`
			`}`
Add increment operator 2015-12-20 21:41:10 +00:00			`UTF8Iterator& operator++()`
			`{`
			`return this->operator+=(1);`
			`}`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`UTF8Iterator operator+(size_t v) const`
			`{`
			`UTF8Iterator ret(m_it);`
			`ret += v;`
			`return ret;`
			`}`
			`uint32_t operator*() const`
			`{`
			`utf8proc_int32_t ret;`
More portable iterator->pointer 2015-12-20 22:51:55 +00:00			`utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t>(&m_it), -1, &ret);`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`return ret;`
			`}`
string_view refactor 2017-11-13 06:12:37 +00:00			`std::string_view::const_iterator iter() const {return m_it;}`
			`size_t countTo(std::string_view::const_iterator end) const`
Update utf8proc 2015-12-27 22:38:37 +00:00			`{`
			`UTF8Iterator it(m_it);`
			`size_t ret = 0;`
utf8proc null-pointer handling 2016-12-11 06:19:22 +00:00			`while (it.iter() < end && *it != '\0')`
Update utf8proc 2015-12-27 22:38:37 +00:00			`{`
			`++ret;`
			`++it;`
			`}`
			`return ret;`
			`}`
Add handy UTF8Iterator class to utf8proc header 2015-12-20 21:13:32 +00:00			`};`

			`#endif`

stripped-down utf8proc added 2015-08-31 21:11:42 +00:00			`#endif`