|
|
@ -98,11 +98,11 @@ namespace |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
class unexpected_low_surrogate : public utf_exception |
|
|
|
class unexpected_surrogate : public utf_exception |
|
|
|
{ |
|
|
|
public: |
|
|
|
explicit unexpected_low_surrogate(char16_t c) : utf_exception(c) {} |
|
|
|
std::string reason() const override { return "Unexpected low surogate " + hex16(value); } |
|
|
|
explicit unexpected_surrogate(char16_t c) : utf_exception(c) {} |
|
|
|
std::string reason() const override { return "Unexpected surogate " + hex16(value); } |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
@ -134,6 +134,23 @@ namespace |
|
|
|
return ret; |
|
|
|
} |
|
|
|
|
|
|
|
std::string escape(pEp::u16string_view s) |
|
|
|
{ |
|
|
|
std::string ret; ret.reserve(s.size() + 16 ); |
|
|
|
for(char16_t c : s) |
|
|
|
{ |
|
|
|
if(c>=32 && c<=126) |
|
|
|
{ |
|
|
|
ret += char(c); |
|
|
|
}else{ |
|
|
|
char buf[16]; |
|
|
|
snprintf(buf,15, "«%04x»", c ); |
|
|
|
ret += buf; |
|
|
|
} |
|
|
|
} |
|
|
|
return ret; |
|
|
|
} |
|
|
|
|
|
|
|
// returns the "CanonicalCombiningClass" of the given Unicode codepoint u
|
|
|
|
unsigned canonicalClass(unsigned u) |
|
|
|
{ |
|
|
@ -312,7 +329,7 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end) |
|
|
|
}else{ |
|
|
|
if(u>=0xDC00) |
|
|
|
{ |
|
|
|
throw unexpected_low_surrogate(u); |
|
|
|
throw unexpected_surrogate(u); |
|
|
|
} |
|
|
|
++c; |
|
|
|
if(c==end) throw unexpected_end(u); |
|
|
@ -327,8 +344,26 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end) |
|
|
|
throw unexpected_end(-1); |
|
|
|
} |
|
|
|
|
|
|
|
template<class CharT> |
|
|
|
uint32_t parseUtf(const CharT*& c, const CharT* end); |
|
|
|
|
|
|
|
template<> |
|
|
|
inline |
|
|
|
uint32_t parseUtf<char>(const char*& c, const char* end) |
|
|
|
{ |
|
|
|
return parseUtf8(c,end); |
|
|
|
} |
|
|
|
|
|
|
|
template<> |
|
|
|
inline |
|
|
|
uint32_t parseUtf<char16_t>(const char16_t*& c, const char16_t* end) |
|
|
|
{ |
|
|
|
return parseUtf16(c,end); |
|
|
|
} |
|
|
|
|
|
|
|
void toUtf8(const char32_t c, std::string& ret) |
|
|
|
|
|
|
|
template<> |
|
|
|
void toUtf<char>(const char32_t c, std::string& ret) |
|
|
|
{ |
|
|
|
if(c<=0x7F) |
|
|
|
{ |
|
|
@ -353,24 +388,52 @@ void toUtf8(const char32_t c, std::string& ret) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
template<> |
|
|
|
void toUtf<char16_t>(const char32_t c, std::u16string& ret) |
|
|
|
{ |
|
|
|
if(c <= 0xFFFF) |
|
|
|
{ |
|
|
|
if(c>=0xD800 && c<=0xDFFF) |
|
|
|
{ |
|
|
|
throw unexpected_surrogate(c); |
|
|
|
}else{ |
|
|
|
ret += char16_t(c); |
|
|
|
} |
|
|
|
}else{ // surrogate pair
|
|
|
|
if(c>0x10FFFF) |
|
|
|
{ |
|
|
|
throw too_big(0, c); |
|
|
|
}else{ |
|
|
|
const uint32_t c_reduced = c - 0x10000; |
|
|
|
ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
|
|
|
|
ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// Encodes a UTF-32 string as a string of CharT code units by appending every
// code point through toUtf<CharT>(char32_t, std::basic_string<CharT>&).
// Exceptions thrown by that per-code-point encoder propagate to the caller.
template<class CharT>
std::basic_string<CharT> toUtf(const std::u32string& u32)
{
	std::basic_string<CharT> ret;
	// Each code point is expected to append at least one code unit, so the
	// input length is a lower bound for the output; reserving it up front
	// saves the first few reallocations.
	ret.reserve(u32.size());

	for(const char32_t c : u32)
	{
		toUtf<CharT>(c, ret);
	}
	return ret;
}
|
|
|
|
|
|
|
|
|
|
|
illegal_utf8::illegal_utf8( string_view s, unsigned position, const std::string& reason) |
|
|
|
|
|
|
|
illegal_utf::illegal_utf( string_view s, unsigned position, const std::string& reason) |
|
|
|
: std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason ) |
|
|
|
{} |
|
|
|
|
|
|
|
// Error for a malformed UTF-16 string. The message contains the input passed
// through escape(), the position argument, and the supplied reason text.
illegal_utf::illegal_utf( u16string_view s, unsigned position, const std::string& reason)
: std::runtime_error(
	"Illegal UTF-16 string \"" + escape(s)
	+ "\" at position " + std::to_string(position)
	+ ": " + reason
  )
{}
|
|
|
|
|
|
|
|
|
|
|
illegal_utf8::illegal_utf8( const std::string& msg ) |
|
|
|
illegal_utf::illegal_utf( const std::string& msg ) |
|
|
|
: std::runtime_error( msg ) |
|
|
|
{} |
|
|
|
|
|
|
@ -389,21 +452,22 @@ void assert_utf8(string_view s) |
|
|
|
} |
|
|
|
catch(const utf_exception& e) |
|
|
|
{ |
|
|
|
throw illegal_utf8(s, begin - s.data(), e.reason()); |
|
|
|
throw illegal_utf(s, begin - s.data(), e.reason()); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// creates an NFD string from s
|
|
|
|
std::u32string fromUtf8_decompose(string_view s) |
|
|
|
template<class CharT> |
|
|
|
std::u32string fromUtf_decompose(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
std::u32string u32s; |
|
|
|
u32s.reserve( static_cast<std::size_t>(s.size()*1.25) ); |
|
|
|
const char* begin = s.data(); |
|
|
|
const char* end = s.data() + s.size(); |
|
|
|
const CharT* begin = s.data(); |
|
|
|
const CharT* end = s.data() + s.size(); |
|
|
|
for(; begin<end; ++begin) |
|
|
|
{ |
|
|
|
unsigned u = parseUtf8(begin, end); |
|
|
|
unsigned u = parseUtf(begin, end); |
|
|
|
u32s += decompose_full(u); |
|
|
|
} |
|
|
|
canonicalOrdering(u32s); // works inplace.
|
|
|
@ -479,16 +543,17 @@ std::u32string createNFC(std::u32string nfd) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
IsNFC isNFC_quick_check(string_view s) |
|
|
|
template<class CharT> |
|
|
|
IsNFC isNFC_quick_check(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
const char* begin = s.data(); |
|
|
|
const char* const end = s.data() + s.size(); |
|
|
|
const CharT* begin = s.data(); |
|
|
|
const CharT* const end = s.data() + s.size(); |
|
|
|
try |
|
|
|
{ |
|
|
|
unsigned last_cc = 0; |
|
|
|
while(begin<end) |
|
|
|
{ |
|
|
|
const uint32_t u = parseUtf8(begin, end); |
|
|
|
const uint32_t u = parseUtf(begin, end); |
|
|
|
const unsigned cc = canonicalClass(u); |
|
|
|
if( (cc!=0) && (last_cc > cc) ) |
|
|
|
{ |
|
|
@ -502,13 +567,14 @@ IsNFC isNFC_quick_check(string_view s) |
|
|
|
} |
|
|
|
catch(const utf_exception& e) |
|
|
|
{ |
|
|
|
throw illegal_utf8(s, begin - s.data(), e.reason()); |
|
|
|
throw illegal_utf(s, begin - s.data(), e.reason()); |
|
|
|
} |
|
|
|
return IsNFC::Yes; |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
bool isNFC(string_view s) |
|
|
|
template<class CharT> |
|
|
|
bool isNFC(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
switch( isNFC_quick_check(s) ) |
|
|
|
{ |
|
|
@ -523,6 +589,11 @@ bool isNFC(string_view s) |
|
|
|
throw -1; // could never happen, but compiler is too dumb to see this.
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Explicit instantiations of the isNFC template defined above, one for
// char-based views and one for char16_t-based views.
template bool isNFC<char>(string_view);
template bool isNFC<char16_t>(u16string_view);
|
|
|
|
|
|
|
|
|
|
|
bool isUtf8(const char* begin, const char* end) |
|
|
|
try{ |
|
|
|
for(; begin<end; ++begin) |
|
|
@ -530,18 +601,20 @@ try{ |
|
|
|
(void)parseUtf8(begin, end); |
|
|
|
} |
|
|
|
return true; |
|
|
|
}catch(const illegal_utf8&) |
|
|
|
}catch(const illegal_utf&) |
|
|
|
{ |
|
|
|
return false; |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// s is ''moved'' to the return value if possible so no copy is done here.
|
|
|
|
std::string toNFC(string_view s) |
|
|
|
template<class CharT> |
|
|
|
std::basic_string<CharT> toNFC(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
if(isNFC_quick_check(s)==IsNFC::Yes) |
|
|
|
return std::string{s}; |
|
|
|
return std::basic_string<CharT>{s}; |
|
|
|
|
|
|
|
return toUtf8( createNFC( fromUtf8_decompose(s) )); |
|
|
|
return toUtf<CharT>( createNFC( fromUtf_decompose(s) )); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|