stripped-down utf8proc added

This commit is contained in:
Jack Andersen
2015-08-31 11:11:42 -10:00
parent 9c44d5f4de
commit bca146dbfc
8 changed files with 360 additions and 2215 deletions

View File

@@ -7,15 +7,9 @@
* Any changes to the types or namespacing must be reflected in 'atdna/main.cpp'
*/
#if _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN 1
#endif
#include <windows.h>
#endif
#include <string.h>
#include <yaml.h>
#include <utf8proc.h>
#include "DNA.hpp"
namespace Athena
@@ -381,44 +375,40 @@ inline std::unique_ptr<YAMLNode> ValToNode(const char* val)
/*! \brief Decodes the UTF-8 text of a scalar YAML node into a wide string.
 *
 *  Decoding is done with utf8proc on every platform. Reading stops at the
 *  first NUL byte or at the end of the node's string, whichever comes first.
 *
 *  \param node Node whose m_scalarString holds the UTF-8 bytes to decode.
 *  \return The decoded wide string. If a malformed UTF-8 sequence is met, a
 *          warning is logged and the characters decoded up to that point are
 *          returned.
 */
template <>
inline std::wstring NodeToVal(const YAMLNode* node)
{
    const auto& utf8 = node->m_scalarString;
    std::wstring retval;
    retval.reserve(utf8.length());

    const utf8proc_uint8_t* cur = reinterpret_cast<const utf8proc_uint8_t*>(utf8.c_str());
    const utf8proc_uint8_t* const end = cur + utf8.size();
    while (cur < end && *cur)
    {
        utf8proc_int32_t cp;
        // Bound the decoder by the bytes that remain instead of relying on the terminator
        const utf8proc_ssize_t len = utf8proc_iterate(cur, utf8proc_ssize_t(end - cur), &cp);
        if (len < 0)
        {
            atWarning("invalid UTF-8 character while decoding");
            return retval;
        }
        cur += len;

        if (sizeof(wchar_t) == 2 && cp > 0xFFFF)
        {
            // A 16-bit wchar_t cannot hold a supplementary code point;
            // a plain cast would drop the high bits, so emit a UTF-16 surrogate pair
            const utf8proc_int32_t offset = cp - 0x10000;
            retval += wchar_t(0xD800 + (offset >> 10));
            retval += wchar_t(0xDC00 + (offset & 0x3FF));
        }
        else
        {
            retval += wchar_t(cp);
        }
    }
    return retval;
}
/*! \brief Builds a scalar YAML node holding the UTF-8 encoding of a wide string.
 *
 *  Encoding is done with utf8proc on every platform. When wchar_t is 16 bits
 *  wide, a high surrogate followed by a low surrogate is combined into one
 *  code point before encoding.
 *
 *  \param val Wide string to encode.
 *  \return A newly allocated YAML_SCALAR_NODE. If a character cannot be
 *          encoded, a warning is logged and the node is returned with the
 *          text encoded up to that point.
 */
template <>
inline std::unique_ptr<YAMLNode> ValToNode(const std::wstring& val)
{
    // Owned from the start so the node is released if reserve/append throws
    std::unique_ptr<YAMLNode> ret(new YAMLNode(YAML_SCALAR_NODE));
    ret->m_scalarString.reserve(val.length());

    for (size_t i = 0; i < val.size(); ++i)
    {
        utf8proc_int32_t cp = utf8proc_int32_t(val[i]);
        if (sizeof(wchar_t) == 2 && cp >= 0xD800 && cp <= 0xDBFF && i + 1 < val.size())
        {
            const utf8proc_int32_t low = utf8proc_int32_t(val[i + 1]);
            if (low >= 0xDC00 && low <= 0xDFFF)
            {
                // Merge the UTF-16 surrogate pair into the code point it represents
                cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
                ++i;
            }
        }

        utf8proc_uint8_t mb[4];
        const utf8proc_ssize_t c = utf8proc_encode_char(cp, mb);
        // utf8proc_encode_char reports failure by writing nothing (returns 0),
        // so a zero result must be treated as an error as well
        if (c <= 0)
        {
            atWarning("invalid UTF-8 character while encoding");
            return ret;
        }
        ret->m_scalarString.append(reinterpret_cast<char*>(mb), c);
    }
    return ret;
}

View File

@@ -1,15 +1,9 @@
#ifndef ISTREAMREADER_HPP
#define ISTREAMREADER_HPP
#if _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN 1
#endif
#include <windows.h>
#endif
#include <memory>
#include <functional>
#include "utf8proc.h"
#include "IStream.hpp"
namespace Athena
@@ -597,32 +591,10 @@ public:
*/
inline std::string readWStringAsString(atInt32 fixedLen = -1)
{
#if _WIN32
std::wstring wstr;
atUint16 chr = readUint16();
atInt32 i;
for (i=0 ;; ++i)
{
if (fixedLen >= 0 && i >= fixedLen - 1)
break;
if (!chr)
break;
wstr += chr;
chr = readUint16();
}
int len = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.size(), nullptr, 0, nullptr, nullptr);
std::string retval(len, '\0');
WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.size(), &retval[0], len, nullptr, nullptr);
#else
std::string retval;
atUint16 chr = readUint16();
atInt32 i;
std::mbstate_t state = {};
for (i=0 ;; ++i)
{
if (fixedLen >= 0 && i >= fixedLen - 1)
@@ -631,12 +603,18 @@ public:
if (!chr)
break;
char mb[MB_LEN_MAX];
int c = std::wcrtomb(mb, chr, &state);
retval.append(mb, c);
utf8proc_uint8_t mb[4];
utf8proc_ssize_t c = utf8proc_encode_char(utf8proc_int32_t(chr), mb);
if (c < 0)
{
atWarning("invalid UTF-8 character while encoding");
return retval;
}
retval.append(reinterpret_cast<char*>(mb), c);
chr = readUint16();
}
#endif
if (fixedLen >= 0 && i < fixedLen)
seek(fixedLen - i);
@@ -645,32 +623,10 @@ public:
inline std::string readWStringAsStringLittle(atInt32 fixedLen = -1)
{
#if _WIN32
std::wstring wstr;
atUint16 chr = readUint16Little();
atInt32 i;
for (i=0 ;; ++i)
{
if (fixedLen >= 0 && i >= fixedLen - 1)
break;
if (!chr)
break;
wstr += chr;
chr = readUint16Little();
}
int len = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.size(), nullptr, 0, nullptr, nullptr);
std::string retval(len, '\0');
WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.size(), &retval[0], len, nullptr, nullptr);
#else
std::string retval;
atUint16 chr = readUint16Little();
atInt32 i;
std::mbstate_t state = {};
for (i=0 ;; ++i)
{
if (fixedLen >= 0 && i >= fixedLen - 1)
@@ -679,12 +635,18 @@ public:
if (!chr)
break;
char mb[MB_LEN_MAX];
int c = std::wcrtomb(mb, chr, &state);
retval.append(mb, c);
utf8proc_uint8_t mb[4];
utf8proc_ssize_t c = utf8proc_encode_char(utf8proc_int32_t(chr), mb);
if (c < 0)
{
atWarning("invalid UTF-8 character while encoding");
return retval;
}
retval.append(reinterpret_cast<char*>(mb), c);
chr = readUint16Little();
}
#endif
if (fixedLen >= 0 && i < fixedLen)
seek(fixedLen - i);
@@ -693,32 +655,10 @@ public:
inline std::string readWStringAsStringBig(atInt32 fixedLen = -1)
{
#if _WIN32
std::wstring wstr;
atUint16 chr = readUint16Big();
atInt32 i;
for (i=0 ;; ++i)
{
if (fixedLen >= 0 && i >= fixedLen - 1)
break;
if (!chr)
break;
wstr += chr;
chr = readUint16Big();
}
int len = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.size(), nullptr, 0, nullptr, nullptr);
std::string retval(len, '\0');
WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.size(), &retval[0], len, nullptr, nullptr);
#else
std::string retval;
atUint16 chr = readUint16Big();
atInt32 i;
std::mbstate_t state = {};
for (i = 0 ;; ++i)
{
if (fixedLen >= 0 && i >= fixedLen - 1)
@@ -727,12 +667,18 @@ public:
if (!chr)
break;
char mb[MB_LEN_MAX];
int c = std::wcrtomb(mb, chr, &state);
retval.append(mb, c);
utf8proc_uint8_t mb[4];
utf8proc_ssize_t c = utf8proc_encode_char(utf8proc_int32_t(chr), mb);
if (c < 0)
{
atWarning("invalid UTF-8 character while encoding");
return retval;
}
retval.append(reinterpret_cast<char*>(mb), c);
chr = readUint16Big();
}
#endif
if (fixedLen >= 0 && i < fixedLen)
seek(fixedLen - i);

View File

@@ -1,13 +1,7 @@
#ifndef ISTREAMWRITER_HPP
#define ISTREAMWRITER_HPP
#if _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN 1
#endif
#include <windows.h>
#endif
#include "utf8proc.h"
#include "IStream.hpp"
namespace Athena
@@ -444,44 +438,21 @@ public:
/*! \brief Decodes a UTF-8 string and writes it as 16-bit values through writeUint16.
 *
 *  NOTE(review): this span looks like a rendered diff with the +/- markers
 *  stripped. Statements of the older MultiByteToWideChar / std::mbrtowc code
 *  and of the utf8proc code appear side by side (e.g. `buf` and `wc` are
 *  declared twice), and the "@@" lines mark places where unchanged lines are
 *  not shown. Confirm against the real file before relying on it.
 *
 *  \param str UTF-8 text to write. It is copied into tmpStr behind the bytes
 *         EF BB BF, and any decoded value equal to 0xFEFF is not written.
 *  \param fixedLen When negative, values are written until the NUL terminator
 *         of tmpStr and a trailing 0 is written afterwards. Otherwise the loop
 *         is bounded by fixedLen and 0 is written once the text is exhausted.
 *
 *  In the utf8proc path a negative result from utf8proc_iterate logs a warning
 *  and returns immediately, leaving the remaining values unwritten.
 */
inline void writeStringAsWString(const std::string& str, atInt32 fixedLen = -1)
{
// Prepend the UTF-8 byte-order mark (EF BB BF); decoded 0xFEFF values are skipped below
std::string tmpStr = "\xEF\xBB\xBF" + str;
#if _WIN32
int len = MultiByteToWideChar(CP_UTF8, 0, tmpStr.c_str(), tmpStr.size(), nullptr, 0);
std::wstring retval(len, L'\0');
MultiByteToWideChar(CP_UTF8, 0, tmpStr.c_str(), tmpStr.size(), &retval[0], len);
if (fixedLen < 0)
{
for (wchar_t ch : retval)
{
if (ch != 0xFEFF)
writeUint16(ch);
}
writeUint16(0);
}
else
{
for (atInt32 i=0 ; i<fixedLen ; ++i)
{
wchar_t wc = retval[i];
if (wc == 0xFEFF)
{
--i;
continue;
}
writeUint16(wc);
}
}
#else
const char* buf = tmpStr.c_str();
std::mbstate_t state = {};
const utf8proc_uint8_t* buf = reinterpret_cast<const utf8proc_uint8_t*>(tmpStr.c_str());
if (fixedLen < 0)
{
// Variable-length form: decode until the terminating NUL byte
while (*buf)
{
wchar_t wc;
buf += std::mbrtowc(&wc, buf, MB_LEN_MAX, &state);
utf8proc_int32_t wc;
utf8proc_ssize_t len = utf8proc_iterate(buf, -1, &wc);
if (len < 0)
{
atWarning("invalid UTF-8 character while decoding");
return;
}
buf += len;
if (wc != 0xFEFF)
writeUint16(wc);
// NOTE(review): presumably atUint16 is a 16-bit type, so a code point above 0xFFFF would lose its high bits here -- confirm
writeUint16(atUint16(wc));
}
writeUint16(0);
}
@@ -489,9 +460,17 @@ public:
{
for (atInt32 i=0 ; i<fixedLen ; ++i)
{
wchar_t wc = 0;
// wc stays 0 once the text is exhausted, so the rest of the field is filled with zeros
utf8proc_int32_t wc = 0;
if (*buf)
buf += std::mbrtowc(&wc, buf, MB_LEN_MAX, &state);
{
utf8proc_ssize_t len = utf8proc_iterate(buf, -1, &wc);
if (len < 0)
{
// NOTE(review): returning here leaves fewer than fixedLen values written -- confirm callers tolerate a short field
atWarning("invalid UTF-8 character while decoding");
return;
}
buf += len;
}
if (wc == 0xFEFF)
{
@@ -499,53 +478,29 @@ public:
continue;
}
writeUint16(wc);
writeUint16(atUint16(wc));
}
}
#endif
}
/*! \brief Decodes a UTF-8 string and writes it as 16-bit values through writeUint16Little.
 *
 *  NOTE(review): this span looks like a rendered diff with the +/- markers
 *  stripped. Statements of the older MultiByteToWideChar / std::mbrtowc code
 *  and of the utf8proc code appear side by side (e.g. `buf` and `wc` are
 *  declared twice), and the "@@" lines mark places where unchanged lines are
 *  not shown. Confirm against the real file before relying on it.
 *
 *  \param str UTF-8 text to write. It is copied into tmpStr behind the bytes
 *         EF BB BF, and any decoded value equal to 0xFEFF is not written.
 *  \param fixedLen When negative, values are written until the NUL terminator
 *         of tmpStr and a trailing 0 is written afterwards. Otherwise the loop
 *         is bounded by fixedLen and 0 is written once the text is exhausted.
 *
 *  In the utf8proc path a negative result from utf8proc_iterate logs a warning
 *  and returns immediately, leaving the remaining values unwritten.
 */
inline void writeStringAsWStringLittle(const std::string& str, atInt32 fixedLen = -1)
{
// Prepend the UTF-8 byte-order mark (EF BB BF); decoded 0xFEFF values are skipped below
std::string tmpStr = "\xEF\xBB\xBF" + str;
#if _WIN32
int len = MultiByteToWideChar(CP_UTF8, 0, tmpStr.c_str(), tmpStr.size(), nullptr, 0);
std::wstring retval(len, L'\0');
MultiByteToWideChar(CP_UTF8, 0, tmpStr.c_str(), tmpStr.size(), &retval[0], len);
if (fixedLen < 0)
{
for (wchar_t ch : retval)
{
if (ch != 0xFEFF)
// NOTE(review): this call is writeUint16, not writeUint16Little as used everywhere else in this function -- confirm
writeUint16(ch);
}
writeUint16Little(0);
}
else
{
for (atInt32 i = 0; i<fixedLen; ++i)
{
wchar_t wc = retval[i];
if (wc == 0xFEFF)
{
--i;
continue;
}
writeUint16Little(wc);
}
}
#else
const char* buf = tmpStr.c_str();
std::mbstate_t state = {};
const utf8proc_uint8_t* buf = reinterpret_cast<const utf8proc_uint8_t*>(tmpStr.c_str());
if (fixedLen < 0)
{
// Variable-length form: decode until the terminating NUL byte
while (*buf)
{
wchar_t wc;
buf += std::mbrtowc(&wc, buf, MB_LEN_MAX, &state);
utf8proc_int32_t wc;
utf8proc_ssize_t len = utf8proc_iterate(buf, -1, &wc);
if (len < 0)
{
atWarning("invalid UTF-8 character while decoding");
return;
}
buf += len;
if (wc != 0xFEFF)
writeUint16Little(wc);
// NOTE(review): presumably atUint16 is a 16-bit type, so a code point above 0xFFFF would lose its high bits here -- confirm
writeUint16Little(atUint16(wc));
}
writeUint16Little(0);
}
@@ -553,9 +508,17 @@ public:
{
for (atInt32 i=0 ; i<fixedLen ; ++i)
{
wchar_t wc = 0;
// wc stays 0 once the text is exhausted, so the rest of the field is filled with zeros
utf8proc_int32_t wc = 0;
if (*buf)
buf += std::mbrtowc(&wc, buf, MB_LEN_MAX, &state);
{
utf8proc_ssize_t len = utf8proc_iterate(buf, -1, &wc);
if (len < 0)
{
// NOTE(review): returning here leaves fewer than fixedLen values written -- confirm callers tolerate a short field
atWarning("invalid UTF-8 character while decoding");
return;
}
buf += len;
}
if (wc == 0xFEFF)
{
@@ -563,53 +526,29 @@ public:
continue;
}
writeUint16Little(wc);
writeUint16Little(atUint16(wc));
}
}
#endif
}
/*! \brief Decodes a UTF-8 string and writes it as 16-bit values through writeUint16Big.
 *
 *  NOTE(review): this span looks like a rendered diff with the +/- markers
 *  stripped. Statements of the older MultiByteToWideChar / std::mbrtowc code
 *  and of the utf8proc code appear side by side (e.g. `buf` and `wc` are
 *  declared twice), and the "@@" lines mark places where unchanged lines are
 *  not shown. Confirm against the real file before relying on it.
 *
 *  \param str UTF-8 text to write. It is copied into tmpStr behind the bytes
 *         EF BB BF, and any decoded value equal to 0xFEFF is not written.
 *  \param fixedLen When negative, values are written until the NUL terminator
 *         of tmpStr and a trailing 0 is written afterwards. Otherwise the loop
 *         is bounded by fixedLen and 0 is written once the text is exhausted.
 *
 *  In the utf8proc path a negative result from utf8proc_iterate logs a warning
 *  and returns immediately, leaving the remaining values unwritten.
 */
inline void writeStringAsWStringBig(const std::string& str, atInt32 fixedLen = -1)
{
// Prepend the UTF-8 byte-order mark (EF BB BF); decoded 0xFEFF values are skipped below
std::string tmpStr = "\xEF\xBB\xBF" + str;
#if _WIN32
int len = MultiByteToWideChar(CP_UTF8, 0, tmpStr.c_str(), tmpStr.size(), nullptr, 0);
std::wstring retval(len, L'\0');
MultiByteToWideChar(CP_UTF8, 0, tmpStr.c_str(), tmpStr.size(), &retval[0], len);
if (fixedLen < 0)
{
for (wchar_t ch : retval)
{
if (ch != 0xFEFF)
// NOTE(review): this call is writeUint16, not writeUint16Big as used everywhere else in this function -- confirm
writeUint16(ch);
}
writeUint16Big(0);
}
else
{
for (atInt32 i = 0; i<fixedLen; ++i)
{
wchar_t wc = retval[i];
if (wc == 0xFEFF)
{
--i;
continue;
}
writeUint16Big(wc);
}
}
#else
const char* buf = tmpStr.c_str();
std::mbstate_t state = {};
const utf8proc_uint8_t* buf = reinterpret_cast<const utf8proc_uint8_t*>(tmpStr.c_str());
if (fixedLen < 0)
{
// Variable-length form: decode until the terminating NUL byte
while (*buf)
{
wchar_t wc;
buf += std::mbrtowc(&wc, buf, MB_LEN_MAX, &state);
utf8proc_int32_t wc;
utf8proc_ssize_t len = utf8proc_iterate(buf, -1, &wc);
if (len < 0)
{
atWarning("invalid UTF-8 character while decoding");
return;
}
buf += len;
if (wc != 0xFEFF)
writeUint16Big(wc);
// NOTE(review): presumably atUint16 is a 16-bit type, so a code point above 0xFFFF would lose its high bits here -- confirm
writeUint16Big(atUint16(wc));
}
writeUint16Big(0);
}
@@ -617,9 +556,17 @@ public:
{
for (atInt32 i=0 ; i<fixedLen ; ++i)
{
wchar_t wc = 0;
// wc stays 0 once the text is exhausted, so the rest of the field is filled with zeros
utf8proc_int32_t wc = 0;
if (*buf)
buf += std::mbrtowc(&wc, buf, MB_LEN_MAX, &state);
{
utf8proc_ssize_t len = utf8proc_iterate(buf, -1, &wc);
if (len < 0)
{
// NOTE(review): returning here leaves fewer than fixedLen values written -- confirm callers tolerate a short field
atWarning("invalid UTF-8 character while decoding");
return;
}
buf += len;
}
if (wc == 0xFEFF)
{
@@ -627,10 +574,9 @@ public:
continue;
}
writeUint16Big(wc);
writeUint16Big(atUint16(wc));
}
}
#endif
}
/*! \brief Writes a string to the buffer and advances the buffer.