Browse Source

add UTF-16 stuff, and try to re-use as much code as possible.

master
roker 4 years ago
parent
commit
d3747a8b1f
  1. 123
      src/nfc.cc
  2. 32
      src/nfc.hh

123
src/nfc.cc

@ -98,11 +98,11 @@ namespace
};
class unexpected_low_surrogate : public utf_exception
class unexpected_surrogate : public utf_exception
{
public:
explicit unexpected_low_surrogate(char16_t c) : utf_exception(c) {}
std::string reason() const override { return "Unexpected low surogate " + hex16(value); }
explicit unexpected_surrogate(char16_t c) : utf_exception(c) {}
std::string reason() const override { return "Unexpected surogate " + hex16(value); }
};
@ -134,6 +134,23 @@ namespace
return ret;
}
// Produces a printable ASCII rendering of the UTF-16 code units in s.
// Code units in the range 32..126 are copied as single chars; every other
// code unit is emitted as «xxxx», i.e. its value in lowercase hex, zero-padded
// to at least four digits.
std::string escape(pEp::u16string_view s)
{
    std::string out;
    out.reserve(s.size() + 16);

    for(const char16_t unit : s)
    {
        const bool printable = (unit >= 32) && (unit <= 126);
        if(printable)
        {
            out += char(unit);
            continue;
        }

        // 2 bytes per guillemet (UTF-8) + 4 hex digits + NUL fits easily.
        char hex[16];
        snprintf(hex, 15, "«%04x»", unit);
        out += hex;
    }

    return out;
}
// returns the "CanonicalCombiningClass" of the given Unicode codepoint u
unsigned canonicalClass(unsigned u)
{
@ -312,7 +329,7 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
}else{
if(u>=0xDC00)
{
throw unexpected_low_surrogate(u);
throw unexpected_surrogate(u);
}
++c;
if(c==end) throw unexpected_end(u);
@ -327,8 +344,26 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
throw unexpected_end(-1);
}
// Encoding-agnostic front end for the UTF parsers: selects the parser that
// matches the code unit type CharT, so templated callers can be written once
// for both UTF-8 (char) and UTF-16 (char16_t) input.
// Only the declaration of the primary template appears here; the two explicit
// specializations below provide the implementations.
//   c   : current read position, passed by reference so the underlying parser
//         can move it
//   end : one past the last code unit of the input
// Returns whatever the selected parser returns (a uint32_t code point value).
template<class CharT>
uint32_t parseUtf(const CharT*& c, const CharT* end);
// UTF-8: forwards unchanged to parseUtf8().
template<>
inline
uint32_t parseUtf<char>(const char*& c, const char* end)
{
return parseUtf8(c,end);
}
// UTF-16: forwards unchanged to parseUtf16().
template<>
inline
uint32_t parseUtf<char16_t>(const char16_t*& c, const char16_t* end)
{
return parseUtf16(c,end);
}
void toUtf8(const char32_t c, std::string& ret)
template<>
void toUtf<char>(const char32_t c, std::string& ret)
{
if(c<=0x7F)
{
@ -353,24 +388,52 @@ void toUtf8(const char32_t c, std::string& ret)
}
}
// UTF-16 flavour of toUtf(): appends the UTF-16 encoding of code point c to ret.
//  - c in 0xD800..0xDFFF (surrogate range) -> throws unexpected_surrogate
//  - c above 0x10FFFF                      -> throws too_big
//  - c up to 0xFFFF                        -> appended as one code unit
//  - everything else                       -> appended as high + low surrogate
template<>
void toUtf<char16_t>(const char32_t c, std::u16string& ret)
{
    // Surrogate values are not valid code points on their own.
    if(c >= 0xD800 && c <= 0xDFFF)
    {
        throw unexpected_surrogate(c);
    }

    // Nothing beyond U+10FFFF can be expressed in UTF-16.
    if(c > 0x10FFFF)
    {
        throw too_big(0, c);
    }

    // Basic Multilingual Plane: a single code unit is enough.
    if(c <= 0xFFFF)
    {
        ret += char16_t(c);
        return;
    }

    // Supplementary planes: split the 20-bit offset into two 10-bit halves.
    const uint32_t offset = c - 0x10000;
    const char16_t high = char16_t(0xD800 + (offset >> 10));
    const char16_t low  = char16_t(0xDC00 + (offset & 0x3FF));
    ret += high;
    ret += low;
}
std::string toUtf8(const std::u32string& u32)
template<class CharT>
std::basic_string<CharT> toUtf(const std::u32string& u32)
{
std::string ret;
std::basic_string<CharT> ret;
for(char32_t c : u32)
{
toUtf8(c, ret);
toUtf<CharT>(c, ret);
}
return ret;
}
illegal_utf8::illegal_utf8( string_view s, unsigned position, const std::string& reason)
illegal_utf::illegal_utf( string_view s, unsigned position, const std::string& reason)
: std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
{}
illegal_utf::illegal_utf( u16string_view s, unsigned position, const std::string& reason)
: std::runtime_error( "Illegal UTF-16 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
{}
illegal_utf8::illegal_utf8( const std::string& msg )
illegal_utf::illegal_utf( const std::string& msg )
: std::runtime_error( msg )
{}
@ -389,21 +452,22 @@ void assert_utf8(string_view s)
}
catch(const utf_exception& e)
{
throw illegal_utf8(s, begin - s.data(), e.reason());
throw illegal_utf(s, begin - s.data(), e.reason());
}
}
// creates an NFD string from s
std::u32string fromUtf8_decompose(string_view s)
template<class CharT>
std::u32string fromUtf_decompose(basic_string_view<CharT> s)
{
std::u32string u32s;
u32s.reserve( static_cast<std::size_t>(s.size()*1.25) );
const char* begin = s.data();
const char* end = s.data() + s.size();
const CharT* begin = s.data();
const CharT* end = s.data() + s.size();
for(; begin<end; ++begin)
{
unsigned u = parseUtf8(begin, end);
unsigned u = parseUtf(begin, end);
u32s += decompose_full(u);
}
canonicalOrdering(u32s); // works inplace.
@ -479,16 +543,17 @@ std::u32string createNFC(std::u32string nfd)
}
IsNFC isNFC_quick_check(string_view s)
template<class CharT>
IsNFC isNFC_quick_check(basic_string_view<CharT> s)
{
const char* begin = s.data();
const char* const end = s.data() + s.size();
const CharT* begin = s.data();
const CharT* const end = s.data() + s.size();
try
{
unsigned last_cc = 0;
while(begin<end)
{
const uint32_t u = parseUtf8(begin, end);
const uint32_t u = parseUtf(begin, end);
const unsigned cc = canonicalClass(u);
if( (cc!=0) && (last_cc > cc) )
{
@ -502,13 +567,14 @@ IsNFC isNFC_quick_check(string_view s)
}
catch(const utf_exception& e)
{
throw illegal_utf8(s, begin - s.data(), e.reason());
throw illegal_utf(s, begin - s.data(), e.reason());
}
return IsNFC::Yes;
}
bool isNFC(string_view s)
template<class CharT>
bool isNFC(basic_string_view<CharT> s)
{
switch( isNFC_quick_check(s) )
{
@ -523,6 +589,11 @@ bool isNFC(string_view s)
throw -1; // could never happen, but compiler is too dumb to see this.
}
template bool isNFC<char>(string_view);
template bool isNFC<char16_t>(u16string_view);
bool isUtf8(const char* begin, const char* end)
try{
for(; begin<end; ++begin)
@ -530,18 +601,20 @@ try{
(void)parseUtf8(begin, end);
}
return true;
}catch(const illegal_utf8&)
}catch(const illegal_utf&)
{
return false;
}
// s is ''moved'' to the return value if possible so no copy is done here.
std::string toNFC(string_view s)
template<class CharT>
std::basic_string<CharT> toNFC(basic_string_view<CharT> s)
{
if(isNFC_quick_check(s)==IsNFC::Yes)
return std::string{s};
return std::basic_string<CharT>{s};
return toUtf8( createNFC( fromUtf8_decompose(s) ));
return toUtf<CharT>( createNFC( fromUtf_decompose(s) ));
}

32
src/nfc.hh

@ -21,43 +21,53 @@ enum class IsNFC
std::ostream& operator<<(std::ostream& o, IsNFC is_nfc);
class illegal_utf8 : public std::runtime_error
class illegal_utf : public std::runtime_error
{
public:
illegal_utf8(string_view, unsigned position, const std::string& reason);
illegal_utf( string_view, unsigned position, const std::string& reason);
illegal_utf(u16string_view, unsigned position, const std::string& reason);
protected:
explicit illegal_utf8(const std::string& message);
explicit illegal_utf(const std::string& message);
};
// scans the char sequences and parses UTF-8 sequences. Detect UTF-8 errors and throws exceptions.
uint32_t parseUtf8(const char*& c, const char* end);
// converts 'c' into a UTF-8 sequence and adds it to 'ret'
void toUtf8(const char32_t c, std::string& ret);
// converts 'c' into a UTF-8/UTF-16 sequence and adds that to 'ret'
template<class CharT>
void toUtf(const char32_t c, std::basic_string<CharT>& ret);
// throws illegal_utf exception if s is not valid UTF-8
void assert_utf8(string_view s);
// creates an NFD u32string from UTF-8 input string s
std::u32string fromUtf8_decompose(string_view s);
// creates an NFD u32string from UTF-8/UTF-16 input string s
template<class CharT>
std::u32string fromUtf_decompose(basic_string_view<CharT> s);
// convert NFD to NFC
std::u32string createNFC(std::u32string nfd_string);
// return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe"
// might throw illegal_utf8 exception
IsNFC isNFC_quick_check(string_view s);
// might throw illegal_utf exception
template<class CharT>
IsNFC isNFC_quick_check(basic_string_view<CharT> s);
// runs first quick check and a deep test if quick check returns "Maybe".
bool isNFC(string_view s);
template<class CharT>
bool isNFC(basic_string_view<CharT> s);
// returns true if the sequence is valid UTF-8
bool isUtf8(const char* begin, const char* end);
// converts a C++ string (in UTF-8) into NFC form
// s is ''moved'' to the return value if possible so no copy is done here.
std::string toNFC(string_view s);
template<class CharT>
std::basic_string<CharT> toNFC(basic_string_view<CharT> s);
// creates a UTF-8-encoded NFC string from s
std::string toNFC_8(u16string_view s);
} // end of namespace pEp

Loading…
Cancel
Save