|
|
@ -352,52 +352,50 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end) |
|
|
|
throw unexpected_end(-1); |
|
|
|
} |
|
|
|
|
|
|
|
template<class CharT> |
|
|
|
uint32_t parseUtf(const CharT*& c, const CharT* end); |
|
|
|
|
|
|
|
template<> |
|
|
|
inline |
|
|
|
uint32_t parseUtf<char>(const char*& c, const char* end) |
|
|
|
uint32_t UTF<char>::parse(const char*& c, const char* end) |
|
|
|
{ |
|
|
|
return parseUtf8(c,end); |
|
|
|
} |
|
|
|
|
|
|
|
template<> |
|
|
|
inline |
|
|
|
uint32_t parseUtf<char16_t>(const char16_t*& c, const char16_t* end) |
|
|
|
uint32_t UTF<char16_t>::parse(const char16_t*& c, const char16_t* end) |
|
|
|
{ |
|
|
|
return parseUtf16(c,end); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
template<> |
|
|
|
void toUtf<char>(const char32_t c, std::string& ret) |
|
|
|
template<class OutIter> |
|
|
|
void UTF<char>::generate(const char32_t c, OutIter& out) |
|
|
|
{ |
|
|
|
if(c<=0x7F) |
|
|
|
{ |
|
|
|
ret += char(c); |
|
|
|
*out++ = char(c); |
|
|
|
}else if(c<=0x7FF) |
|
|
|
{ |
|
|
|
ret += char( 0xC0 + (c>>6) ); |
|
|
|
ret += char( 0x80 + (c & 63)); |
|
|
|
*out++ = char( 0xC0 + (c>>6) ); |
|
|
|
*out++ = char( 0x80 + (c & 63)); |
|
|
|
}else if(c<=0xFFFF) |
|
|
|
{ |
|
|
|
ret += char( 0xE0 + (c>>12) ); |
|
|
|
ret += char( 0x80 + ((c>>6) & 63)); |
|
|
|
ret += char( 0x80 + (c & 63)); |
|
|
|
*out++ = char( 0xE0 + (c>>12) ); |
|
|
|
*out++ = char( 0x80 + ((c>>6) & 63)); |
|
|
|
*out++ = char( 0x80 + (c & 63)); |
|
|
|
}else if(c<=0x10FFFF) |
|
|
|
{ |
|
|
|
ret += char( 0xF0 + (c>>18) ); |
|
|
|
ret += char( 0x80 + ((c>>12) & 63)); |
|
|
|
ret += char( 0x80 + ((c>>6) & 63)); |
|
|
|
ret += char( 0x80 + (c & 63)); |
|
|
|
*out++ = char( 0xF0 + (c>>18) ); |
|
|
|
*out++ = char( 0x80 + ((c>>12) & 63)); |
|
|
|
*out++ = char( 0x80 + ((c>>6) & 63)); |
|
|
|
*out++ = char( 0x80 + (c & 63)); |
|
|
|
}else{ |
|
|
|
throw too_big(0, c); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
template<> |
|
|
|
void toUtf<char16_t>(const char32_t c, std::u16string& ret) |
|
|
|
template<class OutIter> |
|
|
|
void UTF<char16_t>::generate(const char32_t c, OutIter& out) |
|
|
|
{ |
|
|
|
if(c <= 0xFFFF) |
|
|
|
{ |
|
|
@ -405,7 +403,7 @@ void toUtf<char16_t>(const char32_t c, std::u16string& ret) |
|
|
|
{ |
|
|
|
throw unexpected_surrogate(c); |
|
|
|
}else{ |
|
|
|
ret += char16_t(c); |
|
|
|
*out++ = char16_t(c); |
|
|
|
} |
|
|
|
}else{ // surrogate pair
|
|
|
|
if(c>0x10FFFF) |
|
|
@ -413,19 +411,20 @@ void toUtf<char16_t>(const char32_t c, std::u16string& ret) |
|
|
|
throw too_big(0, c); |
|
|
|
}else{ |
|
|
|
const uint32_t c_reduced = c - 0x10000; |
|
|
|
ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
|
|
|
|
ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
|
|
|
|
*out++ = char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
|
|
|
|
*out++ = char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
template<class CharT> |
|
|
|
std::basic_string<CharT> toUtf(const std::u32string& u32) |
|
|
|
std::basic_string<CharT> UTF<CharT>::generate(const std::u32string& u32) |
|
|
|
{ |
|
|
|
std::basic_string<CharT> ret; |
|
|
|
auto out = std::back_inserter(ret); |
|
|
|
for(char32_t c : u32) |
|
|
|
{ |
|
|
|
toUtf<CharT>(c, ret); |
|
|
|
generate(c, out); |
|
|
|
} |
|
|
|
return ret; |
|
|
|
} |
|
|
@ -454,7 +453,7 @@ void assert_utf8(string_view s) |
|
|
|
{ |
|
|
|
while(begin<end) |
|
|
|
{ |
|
|
|
parseUtf8(begin, end); // ignore the output
|
|
|
|
UTF8::parse(begin, end); // ignore the output
|
|
|
|
++begin; |
|
|
|
} |
|
|
|
} |
|
|
@ -467,7 +466,7 @@ void assert_utf8(string_view s) |
|
|
|
|
|
|
|
// creates a NFD string from s
|
|
|
|
template<class CharT> |
|
|
|
std::u32string fromUtf_decompose(basic_string_view<CharT> s) |
|
|
|
std::u32string UTF<CharT>::fromUtf_decompose(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
std::u32string u32s; |
|
|
|
u32s.reserve( static_cast<std::size_t>(s.size()*1.25) ); |
|
|
@ -475,7 +474,7 @@ std::u32string fromUtf_decompose(basic_string_view<CharT> s) |
|
|
|
const CharT* end = s.data() + s.size(); |
|
|
|
for(; begin<end; ++begin) |
|
|
|
{ |
|
|
|
unsigned u = parseUtf(begin, end); |
|
|
|
unsigned u = parse(begin, end); |
|
|
|
u32s += decompose_full(u); |
|
|
|
} |
|
|
|
canonicalOrdering(u32s); // works inplace.
|
|
|
@ -552,7 +551,7 @@ std::u32string createNFC(std::u32string nfd) |
|
|
|
|
|
|
|
|
|
|
|
template<class CharT> |
|
|
|
IsNFC isNFC_quick_check(basic_string_view<CharT> s) |
|
|
|
IsNFC UTF<CharT>::isNFC_quick_check(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
const CharT* begin = s.data(); |
|
|
|
const CharT* const end = s.data() + s.size(); |
|
|
@ -561,7 +560,7 @@ IsNFC isNFC_quick_check(basic_string_view<CharT> s) |
|
|
|
unsigned last_cc = 0; |
|
|
|
while(begin<end) |
|
|
|
{ |
|
|
|
const uint32_t u = parseUtf(begin, end); |
|
|
|
const uint32_t u = parse(begin, end); |
|
|
|
const unsigned cc = canonicalClass(u); |
|
|
|
if( (cc!=0) && (last_cc > cc) ) |
|
|
|
{ |
|
|
@ -582,7 +581,7 @@ IsNFC isNFC_quick_check(basic_string_view<CharT> s) |
|
|
|
|
|
|
|
|
|
|
|
template<class CharT> |
|
|
|
bool isNFC(basic_string_view<CharT> s) |
|
|
|
bool UTF<CharT>::isNFC(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
switch( isNFC_quick_check(s) ) |
|
|
|
{ |
|
|
@ -598,19 +597,13 @@ bool isNFC(basic_string_view<CharT> s) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
template bool isNFC<char>(string_view); |
|
|
|
template bool isNFC<char16_t>(u16string_view); |
|
|
|
|
|
|
|
// should be unecessary, but... well...
|
|
|
|
template std::string toNFC<char>(string_view); |
|
|
|
template std::u16string toNFC<char16_t>(u16string_view); |
|
|
|
|
|
|
|
|
|
|
|
bool isUtf8(const char* begin, const char* end) |
|
|
|
template<> |
|
|
|
bool UTF<char>::isUtf(const char* begin, const char* end) |
|
|
|
try{ |
|
|
|
for(; begin<end; ++begin) |
|
|
|
{ |
|
|
|
(void)parseUtf8(begin, end); |
|
|
|
(void)parse(begin, end); |
|
|
|
} |
|
|
|
return true; |
|
|
|
}catch(const illegal_utf&) |
|
|
@ -621,12 +614,39 @@ try{ |
|
|
|
|
|
|
|
// s is ''moved'' to the return value if possible so no copy is done here.
|
|
|
|
template<class CharT> |
|
|
|
std::basic_string<CharT> toNFC(basic_string_view<CharT> s) |
|
|
|
std::basic_string<CharT> UTF<CharT>::toNFC(basic_string_view<CharT> s) |
|
|
|
{ |
|
|
|
if(isNFC_quick_check(s)==IsNFC::Yes) |
|
|
|
return std::basic_string<CharT>{s}; |
|
|
|
|
|
|
|
return toUtf<CharT>( createNFC( fromUtf_decompose(s) )); |
|
|
|
return generate( createNFC( fromUtf_decompose(s) )); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
template<> |
|
|
|
size_t UTF<char>::utf_length(u32string_view s) |
|
|
|
{ |
|
|
|
size_t len = 0; |
|
|
|
for(const char32_t c : s) |
|
|
|
{ |
|
|
|
if(c <= 0x7f) |
|
|
|
{ |
|
|
|
len += 1; |
|
|
|
}else if(c<=0x7ff) |
|
|
|
{ |
|
|
|
len += 2; |
|
|
|
}else if(c<=0xffff) |
|
|
|
{ |
|
|
|
len += 3; |
|
|
|
}else if(c<=0x10ffff) |
|
|
|
{ |
|
|
|
len += 4; |
|
|
|
}else{ |
|
|
|
throw too_big(0, c); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return len; |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -634,19 +654,17 @@ std::basic_string<CharT> toNFC(basic_string_view<CharT> s) |
|
|
|
// and unecessary temporary std::string etc.
|
|
|
|
char* strdup_NFC(string_view s) |
|
|
|
{ |
|
|
|
if(isNFC_quick_check(s)==IsNFC::Yes) |
|
|
|
if(UTF8::isNFC_quick_check(s)==IsNFC::Yes) |
|
|
|
return ::new_string(s.data(), s.size()); |
|
|
|
|
|
|
|
// implement the hard way more efficient
|
|
|
|
/********** FIXME: need more re-work, so I'll do the dumb way first
|
|
|
|
|
|
|
|
const std::u32string& u32 = createNFC( fromUtf_decompose(s) ); |
|
|
|
const size_t out_len = utf8len(u32); |
|
|
|
const std::u32string& u32 = createNFC( UTF8::fromUtf_decompose(s) ); |
|
|
|
const size_t out_len = UTF8::utf_length(u32); |
|
|
|
char* ret = ::new_string(nullptr, out_len ); |
|
|
|
char* iter{ret}; |
|
|
|
for(const char32_t c : u32) |
|
|
|
{ |
|
|
|
toUtf<char, char*>(c, iter); |
|
|
|
UTF8::generate(c, iter); |
|
|
|
} |
|
|
|
|
|
|
|
if(iter > ret+out_len) // should never happen. ;)
|
|
|
@ -655,14 +673,13 @@ char* strdup_NFC(string_view s) |
|
|
|
} |
|
|
|
|
|
|
|
return ret; |
|
|
|
********************/ |
|
|
|
|
|
|
|
// Correct but inefficient:
|
|
|
|
const std::string ret = toNFC<char>(s); |
|
|
|
return ::new_string(ret.data(), 0); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
template class UTF<char>; |
|
|
|
template class UTF<char16_t>; |
|
|
|
|
|
|
|
|
|
|
|
// used only to initialize the NFC Compose mapping:
|
|
|
|
std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose() |
|
|
|
{ |
|
|
|