From d3747a8b1fcc749fe7a2d5545b4006c32ead35d1 Mon Sep 17 00:00:00 2001 From: roker Date: Tue, 6 Jul 2021 13:23:10 +0200 Subject: [PATCH] add UTF-16 stuff, and try to re-use as much code as possible. --- src/nfc.cc | 123 ++++++++++++++++++++++++++++++++++++++++++----------- src/nfc.hh | 32 +++++++++----- 2 files changed, 119 insertions(+), 36 deletions(-) diff --git a/src/nfc.cc b/src/nfc.cc index f3d7c0a..8d0318c 100644 --- a/src/nfc.cc +++ b/src/nfc.cc @@ -98,11 +98,11 @@ namespace }; - class unexpected_low_surrogate : public utf_exception + class unexpected_surrogate : public utf_exception { public: - explicit unexpected_low_surrogate(char16_t c) : utf_exception(c) {} - std::string reason() const override { return "Unexpected low surogate " + hex16(value); } + explicit unexpected_surrogate(char16_t c) : utf_exception(c) {} + std::string reason() const override { return "Unexpected surogate " + hex16(value); } }; @@ -134,6 +134,23 @@ namespace return ret; } + std::string escape(pEp::u16string_view s) + { + std::string ret; ret.reserve(s.size() + 16 ); + for(char16_t c : s) + { + if(c>=32 && c<=126) + { + ret += char(c); + }else{ + char buf[16]; + snprintf(buf,15, "«%04x»", c ); + ret += buf; + } + } + return ret; + } + // returns the "CanonicalCombinincClass" of the given Unicode codpoint u unsigned canonicalClass(unsigned u) { @@ -312,7 +329,7 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end) }else{ if(u>=0xDC00) { - throw unexpected_low_surrogate(u); + throw unexpected_surrogate(u); } ++c; if(c==end) throw unexpected_end(u); @@ -327,8 +344,26 @@ uint32_t parseUtf16(const char16_t*& c, const char16_t* end) throw unexpected_end(-1); } +template +uint32_t parseUtf(const CharT*& c, const CharT* end); + +template<> +inline +uint32_t parseUtf(const char*& c, const char* end) +{ + return parseUtf8(c,end); +} + +template<> +inline +uint32_t parseUtf(const char16_t*& c, const char16_t* end) +{ + return parseUtf16(c,end); +} -void toUtf8(const char32_t c, std::string& ret) + +template<> +void toUtf(const char32_t c, std::string& ret) { if(c<=0x7F) { @@ -353,24 +388,52 @@ void toUtf8(const char32_t c, std::string& ret) } } +template<> +void toUtf(const char32_t c, std::u16string& ret) +{ + if(c <= 0xFFFF) + { + if(c>=0xD800 && c<=0xDFFF) + { + throw unexpected_surrogate(c); + }else{ + ret += char16_t(c); + } + }else{ // surrogate pair + if(c>0x10FFFF) + { + throw too_big(0, c); + }else{ + const uint32_t c_reduced = c - 0x10000; + ret += char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate + ret += char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate + } + } +} -std::string toUtf8(const std::u32string& u32) +template +std::basic_string toUtf(const std::u32string& u32) { - std::string ret; + std::basic_string ret; for(char32_t c : u32) { - toUtf8(c, ret); + toUtf(c, ret); } return ret; } -illegal_utf8::illegal_utf8( string_view s, unsigned position, const std::string& reason) + +illegal_utf::illegal_utf( string_view s, unsigned position, const std::string& reason) : std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason ) {} +illegal_utf::illegal_utf( u16string_view s, unsigned position, const std::string& reason) +: std::runtime_error( "Illegal UTF-16 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason ) +{} + -illegal_utf8::illegal_utf8( const std::string& msg ) +illegal_utf::illegal_utf( const std::string& msg ) : std::runtime_error( msg ) {} @@ -389,21 +452,22 @@ void assert_utf8(string_view s) } catch(const utf_exception& e) { - throw illegal_utf8(s, begin - s.data(), e.reason()); + throw illegal_utf(s, begin - s.data(), e.reason()); } } // creates a NFD string from s -std::u32string fromUtf8_decompose(string_view s) +template +std::u32string fromUtf_decompose(basic_string_view s) { std::u32string u32s; u32s.reserve( static_cast(s.size()*1.25) ); - const char* begin = s.data(); - const char* end = s.data() + s.size(); + const CharT* begin = s.data(); + const CharT* end = s.data() + s.size(); for(; begin +IsNFC isNFC_quick_check(basic_string_view s) { - const char* begin = s.data(); - const char* const end = s.data() + s.size(); + const CharT* begin = s.data(); + const CharT* const end = s.data() + s.size(); try { unsigned last_cc = 0; while(begin cc) ) { @@ -502,13 +567,14 @@ IsNFC isNFC_quick_check(string_view s) } catch(const utf_exception& e) { - throw illegal_utf8(s, begin - s.data(), e.reason()); + throw illegal_utf(s, begin - s.data(), e.reason()); } return IsNFC::Yes; } -bool isNFC(string_view s) +template +bool isNFC(basic_string_view s) { switch( isNFC_quick_check(s) ) { @@ -523,6 +589,11 @@ bool isNFC(string_view s) throw -1; // could never happen, but compiler is too dumb to see this. } + +template bool isNFC(string_view); +template bool isNFC(u16string_view); + + bool isUtf8(const char* begin, const char* end) try{ for(; begin +std::basic_string toNFC(basic_string_view s) { if(isNFC_quick_check(s)==IsNFC::Yes) - return std::string{s}; + return std::basic_string{s}; - return toUtf8( createNFC( fromUtf8_decompose(s) )); + return toUtf( createNFC( fromUtf_decompose(s) )); } diff --git a/src/nfc.hh b/src/nfc.hh index 433af50..0e12ebd 100644 --- a/src/nfc.hh +++ b/src/nfc.hh @@ -21,43 +21,53 @@ enum class IsNFC std::ostream& operator<<(std::ostream& o, IsNFC is_nfc); -class illegal_utf8 : public std::runtime_error +class illegal_utf : public std::runtime_error { public: - illegal_utf8(string_view, unsigned position, const std::string& reason); + illegal_utf( string_view, unsigned position, const std::string& reason); + illegal_utf(u16string_view, unsigned position, const std::string& reason); protected: - explicit illegal_utf8(const std::string& message); + explicit illegal_utf(const std::string& message); }; // scans the char sequences and parses UTF-8 sequences. Detect UTF-8 errors and throws exceptions. uint32_t parseUtf8(const char*& c, const char* end); -// converts 'c' into a UTF-8 sequence and adds it to 'ret' -void toUtf8(const char32_t c, std::string& ret); +// converts 'c' into a UTF-8/UTF-16 sequence and adds that to 'ret' +template +void toUtf(const char32_t c, std::basic_string& ret); // throws illegal_utf8 exception if s is not valid UTF-8 void assert_utf8(string_view s); -// creates an NFD u32string from UTF-8 input string s -std::u32string fromUtf8_decompose(string_view s); +// creates an NFD u32string from UTF-8/UTF-16 input string s +template +std::u32string fromUtf_decompose(basic_string_view s); // convert NFD to NFC std::u32string createNFC(std::u32string nfd_string); // return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe" -// might throw illegal_utf8 exception -IsNFC isNFC_quick_check(string_view s); +// might throw illegal_utf exception +template +IsNFC isNFC_quick_check(basic_string_view s); // runs first quick check and a deep test if quick check returns "Maybe". -bool isNFC(string_view s); +template +bool isNFC(basic_string_view s); // returns true if the sequence is valid UTF-8 bool isUtf8(const char* begin, const char* end); // converts a C++ string (in UTF-8) into NFC form // s is ''moved'' to the return value if possible so no copy is done here. -std::string toNFC(string_view s); +template +std::basic_string toNFC(basic_string_view s); + +// creates a UTF-8-encoded NFC string from s +std::string toNFC_8(u16string_view s); + } // end of namespace pEp