// This file is under GNU General Public License 3.0
// see LICENSE.txt

// converts a C++ string into NFC form
//
// NOTE(review): this copy of the file looks damaged by a text-extraction step.
// Everything that was written between angle brackets is missing (system header
// names, template parameter lists, template argument lists), and in several
// places a whole run of statements between a '<' and a later '>' is gone.
// The code tokens below are kept exactly as found, only re-broken into lines.
// Every damaged spot is marked with NOTE(review) and has to be restored from
// the upstream file before this translation unit can compile.

#include "nfc.hh"
// NOTE(review): the header names of the bare #include directives are missing.
#include
#include
#include
#include
#include "nfc_sets.hh"
#include

namespace
{

// unicode to hex string
// NOTE(review): the format string is empty in this copy, so `u` is never
// formatted and the function returns an empty string. The conversion
// specification was presumably lost together with the other '<...>' text.
std::string u2h(unsigned u)
{
    char buf[16] = {0};
    snprintf(buf, 15, "", u );
    return buf;
}

// octet to hex string: "0x" followed by two upper-case hex digits
std::string o2h(uint8_t octet)
{
    char buf[16] = {0};
    snprintf(buf, 15, "0x%02hhX", octet);
    return buf;
}

// hex string of a 16-bit value: "0x" followed by at least four upper-case hex digits
std::string hex16(char16_t u)
{
    char buf[16] = {0};
    snprintf(buf, 15, "0x%04X", u);
    return buf;
}

// Abstract base class of the errors thrown by the parsing and generating code
// in this file. Derived classes provide the message text through reason().
class utf_exception
{
public:
    // The argument is stored twice: converted to uint8_t in `octet` (only the
    // low 8 bits survive) and unchanged in `value`.
    // NOTE(review): callers pass octets as well as 16-bit code units here;
    // for the latter `octet` holds a truncated value.
    utf_exception(uint16_t u) : octet(u), value(u) {}
    virtual ~utf_exception() = default;
    virtual std::string reason() const = 0;
    uint8_t octet;
    uint16_t value;
};

// Error: a continuation octet was found where a start octet was expected.
class cont_without_start : public utf_exception
{
public:
    cont_without_start(uint8_t u) : utf_exception(u) {}
    std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; }
};

// Error: the decoded value `unicode` was encoded with more octets than necessary.
class overlong_sequence : public utf_exception
{
public:
    overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {}
    std::string reason() const override { return "Overlong sequence for " + u2h(unicode); }
    unsigned unicode;
};

// Error: the input ended in the middle of a sequence.
// The stored octet is not part of the message.
class unexpected_end : public utf_exception
{
public:
    unexpected_end(uint8_t u) : utf_exception(u) {}
    std::string reason() const override { return "Unexpected end of string"; }
};

// Error: a UTF-8 sequence decoded to a value in the surrogate range.
class surrogate : public utf_exception
{
public:
    surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {}
    std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; }
private:
    unsigned surr;
};

// Error: the octet cannot start any sequence accepted by the UTF-8 parser.
class no_unicode : public utf_exception
{
public:
    explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {}
    std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; }
};

// Error: the value `unicode` lies above the range accepted as Unicode
// (the throw sites in this file compare against 0x10FFFF).
class too_big : public utf_exception
{
public:
    explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {}
    std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; }
    unsigned unicode;
};

// Error: a surrogate value appeared where none is allowed.
class unexpected_surrogate : public utf_exception
{
public:
    explicit unexpected_surrogate(char16_t c) : utf_exception(c) {}
    std::string reason() const override { return "Unexpected surogate " + hex16(value); }
};

// Error: the unit `c` following the high surrogate `_surr` is not a low surrogate.
class missing_low_surrogate : public utf_exception
{
public:
    explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {}
    std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surogate " + hex16(surr); }
private:
    char16_t surr;
};

// Makes a byte string printable for error messages: bytes 32..126 are copied
// unchanged, every other byte is written as two lower-case hex digits between
// « and ».
std::string escape(std::string_view s)
{
    std::string ret;
    ret.reserve(s.size() + 16 );
    for(char c : s)
    {
        const uint8_t u = c; // compare as unsigned value, independent of the signedness of char
        if(u>=32 && u<=126)
        {
            ret += c;
        }else{
            char buf[16];
            snprintf(buf,15, "«%02x»", u );
            ret += buf;
        }
    }
    return ret;
}

// Same as above for 16-bit code units: units 32..126 are copied as a single
// char, every other unit is written as at least four hex digits between « and ».
std::string escape(std::u16string_view s)
{
    std::string ret;
    ret.reserve(s.size() + 16 );
    for(char16_t c : s)
    {
        if(c>=32 && c<=126)
        {
            ret += char(c);
        }else{
            char buf[16];
            snprintf(buf,15, "«%04x»", c );
            ret += buf;
        }
    }
    return ret;
}

// returns the "Canonical_Combining_Class" of the given Unicode code point u,
// as looked up in NFC_CombiningClass; code points without an entry get class 0.
unsigned canonicalClass(unsigned u)
{
    const auto q = NFC_CombiningClass.find(u);
    if(q==NFC_CombiningClass.end())
    {
        return 0; // not found in map.
    }else{
        return q->second;
    }
}

// Looks up u in NFC_Decompose. Returns the stored pair, or (-1, -1) when u has
// no entry.
// NOTE(review): the template arguments of std::pair are missing in this copy.
std::pair decompose(unsigned u)
{
    const auto q = NFC_Decompose.find(u);
    if(q==NFC_Decompose.end())
    {
        return std::make_pair(-1, -1);
    }else{
        return q->second;
    }
}

// Recursively expands u with decompose() until no part has an entry any more:
//   first  < 0 : u has no decomposition, it is returned as a one-element string
//   second < 0 : only the first element is expanded further
//   otherwise  : both elements are expanded and concatenated
// NOTE(review): the template arguments of std::pair are missing in this copy.
std::u32string decompose_full(unsigned u)
{
    const std::pair d = decompose(u);
    if(d.first<0)
    {
        return std::u32string( 1, char32_t(u) );
    }else{
        if(d.second<0)
        {
            return decompose_full(d.first);
        }
    }
    return decompose_full(d.first) + decompose_full(d.second);
}

// according to Unicode Standard, clause D108:
// true when the class of a is greater than the class of b and the class of b is not 0.
bool isReorderablePair(unsigned a, unsigned b)
{
    const unsigned cca = canonicalClass(a);
    const unsigned ccb = canonicalClass(b);
    return (cca > ccb) && (ccb>0);
}

// Unicode standard requires bubble sort, for stability reasons?
// Strings with fewer than two elements are left untouched.
void canonicalOrdering(std::u32string& us)
{
    if(us.size()<2)
        return;

    for(unsigned n=us.size(); n>1; --n)
        for(unsigned i=0; i
// NOTE(review): text is missing here. Lost are the rest of canonicalOrdering(),
// whatever stood between the two functions, and the beginning of the following
// function: its signature and everything up to the surrogate check of what is
// presumably the 3-octet branch. Judging from the call further below it is
// parseUtf8(const char*& c, const char* end) - TODO confirm against upstream.
// The closing brace of the anonymous namespace and the opening of namespace pEp
// (see the last line of the file) are presumably also in a lost span.
=0xD800 && ret<=0xDFFF) throw surrogate(u, ret);
            return ret;
        }
        else if (u<=0xF4) // 4 octet sequence
        {
            // Read three further octets; each must have the bit pattern 10xxxxxx.
            // NOTE(review): an octet that fails this pattern test is reported
            // with unexpected_end ("Unexpected end of string") although the
            // input has not ended - confirm that this message is intended.
            ++c;
            if(c==end) throw unexpected_end(u);
            const uint8_t uu = uint8_t(*c);
            if((uu & 0xC0) != 0x80)
            {
                throw unexpected_end(uu);
            }

            ++c;
            if(c==end) throw unexpected_end(uu);
            const uint8_t uuu = uint8_t(*c);
            if((uuu & 0xC0) != 0x80)
            {
                throw unexpected_end(uuu);
            }

            ++c;
            if(c==end) throw unexpected_end(uuu);
            const uint8_t uuuu = uint8_t(*c);
            if((uuuu & 0xC0) != 0x80)
            {
                throw unexpected_end(uuuu);
            }

            // payload bits: low bits of the start octet, then 6 bits of each following octet
            const uint32_t ret = ((u & 0xF) << 18) + ((uu & 0x3F)<<12) + ((uuu & 0x3F)<<6) + (uuuu & 0x3F);
            if(ret<0x10000) throw overlong_sequence(u, ret); // would have fit into fewer octets
            if(ret>0x10FFFF) throw too_big(u, ret);
            return ret;
        }
        else
        {
            throw no_unicode(u);
        }
    }

    // NOTE(review): -1 is converted to uint8_t here; unexpected_end does not
    // use the stored octet in its message.
    throw unexpected_end(-1);
}

// Decodes one code point from UTF-16 code units.
// On return from the surrogate-pair branch `c` points at the low surrogate.
// Throws unexpected_surrogate, unexpected_end or missing_low_surrogate.
uint32_t parseUtf16(const char16_t*& c, const char16_t* end)
{
    while(c
// NOTE(review): text is missing here: the rest of the loop condition, the
// definition of `u` and the first part of the range test that ends in
// "=0xE000". Presumably units outside the surrogate range are returned
// unchanged - TODO confirm against upstream.
=0xE000)
        {
            return u;
        }else{
            if(u>=0xDC00)
            {
                // a low surrogate must not come first
                throw unexpected_surrogate(u);
            }
            ++c;
            if(c==end) throw unexpected_end(u);
            const uint16_t low = *c;
            if(low < 0xDC00 || low > 0xDFFF)
            {
                throw missing_low_surrogate(low, u);
            }
            // combine the two 10-bit halves and add the 0x10000 offset
            return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000;
        }
    }

    throw unexpected_end(-1);
}

// Specialization that forwards to parseUtf8().
// NOTE(review): the template argument of UTF is missing in this copy.
template<>
uint32_t UTF::parse(const char*& c, const char* end)
{
    return parseUtf8(c,end);
}

// Specialization that forwards to parseUtf16().
// NOTE(review): the template argument of UTF is missing in this copy.
template<>
uint32_t UTF::parse(const char16_t*& c, const char16_t* end)
{
    return parseUtf16(c,end);
}

// Writes the code point c as 1 to 4 chars (UTF-8) through the output iterator.
// Throws unexpected_surrogate for 0xD800..0xDFFF and too_big above 0x10FFFF.
// NOTE(review): the template argument of UTF and the parameter list that
// declares OutIter are missing in this copy.
template<>
template
void UTF::generate(const char32_t c, OutIter& out)
{
    if(c<=0x7F)
    {
        *out++ = char(c);
    }else if(c<=0x7FF)
    {
        *out++ = char( 0xC0 + (c>>6) );
        *out++ = char( 0x80 + (c & 63));
    }else if(c<=0xFFFF)
    {
        if(c>=0xD800 && c<=0xDFFF)
        {
            throw unexpected_surrogate(c);
        }
        *out++ = char( 0xE0 + (c>>12) );
        *out++ = char( 0x80 + ((c>>6) & 63));
        *out++ = char( 0x80 + (c & 63));
    }else if(c<=0x10FFFF)
    {
        *out++ = char( 0xF0 + (c>>18) );
        *out++ = char( 0x80 + ((c>>12) & 63));
        *out++ = char( 0x80 + ((c>>6) & 63));
        *out++ = char( 0x80 + (c & 63));
    }else{
        throw too_big(0, c);
    }
}

// Writes the code point c as one or two char16_t units (UTF-16) through the
// output iterator.
// Throws unexpected_surrogate for 0xD800..0xDFFF and too_big above 0x10FFFF.
// NOTE(review): the template argument of UTF and the parameter list that
// declares OutIter are missing in this copy.
template<>
template
void UTF::generate(const char32_t c, OutIter& out)
{
    if(c <= 0xFFFF)
    {
        if(c>=0xD800 && c<=0xDFFF)
        {
            throw unexpected_surrogate(c);
        }else{
            *out++ = char16_t(c);
        }
    }else{ // surrogate pair
        if(c>0x10FFFF)
        {
            throw too_big(0, c);
        }else{
            const uint32_t c_reduced = c - 0x10000;
            *out++ = char16_t(0xD800 + (c_reduced >> 10)); // High Surrogate
            *out++ = char16_t(0xDC00 + (c_reduced & 0x3FF)); // Low Surrogate
        }
    }
}

// Encodes a whole UTF-32 string by calling the per-code-point generate() for
// every element and appending to the result string.
// NOTE(review): the template parameter list and the template arguments of
// std::basic_string and UTF are missing in this copy.
template
std::basic_string UTF::generate(const std::u32string& u32)
{
    std::basic_string ret;
    auto out = std::back_inserter(ret);
    for(char32_t c : u32)
    {
        generate(c, out);
    }
    return ret;
}

// Builds the message from the escaped input string, the position and the reason text.
illegal_utf::illegal_utf( std::string_view s, unsigned position, const std::string& reason)
: std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
{}

// Same as above for UTF-16 input.
illegal_utf::illegal_utf( std::u16string_view s, unsigned position, const std::string& reason)
: std::runtime_error( "Illegal UTF-16 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason )
{}

// Uses msg unchanged as the exception message.
illegal_utf::illegal_utf( const std::string& msg )
: std::runtime_error( msg )
{}

// NOTE(review): only the first lines of assert_utf8() are present in this copy.
void assert_utf8(std::string_view s)
{
    const char* begin = s.data();
    const char* const end = s.data() + s.size();
    try
    {
        while(begin
// NOTE(review): text is missing here: the rest of assert_utf8() and the
// template header of the following member function.
 std::u32string UTF::fromUtf_decompose(std::basic_string_view s)
{
    std::u32string u32s;
    // NOTE(review): the target type of static_cast is missing in this copy.
    u32s.reserve( static_cast(s.size()*1.25) );
    const CharT* begin = s.data();
    const CharT* end = s.data() + s.size();
    for(; begin
// NOTE(review): text is missing here: the loop of fromUtf_decompose() together
// with the rest of that function, and the template header that declares Iter
// for blocked().

// Returns true when a position B strictly between L and C exists whose
// combining class is 0 or equal to the combining class of *C.
 bool blocked(Iter L, Iter C)
{
    Iter B = L; ++B;
    for(;B!=C;++B)
    {
        if(canonicalClass(*B)==0 || canonicalClass(*B)==canonicalClass(*C))
            return true;
    }
    return false;
}

// Composes the run [starter, next_starter): every following element that is
// not blocked and has an entry in NFC_Compose together with the current value
// of *starter is merged into *starter. What remains is appended to `nfc`.
// NOTE(review): the template parameter list declaring Iter is missing.
template
void combine(std::u32string& nfc, Iter starter, Iter next_starter)
{
    Iter c = starter; ++c;
    for(;c!=next_starter; ++c)
    {
        if(!blocked(starter, c))
        {
            const unsigned starter_u = *starter;
            const unsigned c_u = *c;

            auto q = NFC_Compose.find( std::make_pair(starter_u,c_u) );
            if(q!=NFC_Compose.end())
            {
                *starter = q->second;
                // mark the element as consumed; the loop below skips it.
                // NOTE(review): blocked() calls canonicalClass() on this marker
                // value for every later element of the run. If
                // NFC_CombiningClass has no entry for it, the class is 0 and
                // all later elements are reported as blocked - confirm that
                // this is intended.
                *c = -1;
            }
        }
    }

    // now add the remaining/changed characters to the NFC string:
    for(Iter c = starter; c!=next_starter; ++c)
    {
        if( int(*c) >= 0)
        {
            nfc += *c;
        }
    }
}

// the nfd string is changed during composing process. So it works on a copy or call with std::move().
// Elements with a non-zero combining class that are not preceded by a class-0
// element are copied unchanged. Each class-0 element is handed to combine()
// together with the elements up to the next class-0 element.
// NOTE(review): two adjacent class-0 elements are never looked up in
// NFC_Compose as a pair here - confirm whether such compositions are handled
// elsewhere or not needed.
std::u32string createNFC(std::u32string nfd)
{
    if(nfd.size()<=1)
        return nfd;

    std::u32string nfc;
    nfc.reserve(nfd.size());
    auto starter = nfd.begin();
    while( starter != nfd.end() )
    {
        if( canonicalClass(*starter)!=0 )
        {
            nfc += *starter;
            ++starter;
        }else{
            auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} );
            combine(nfc, starter, next_starter);
            starter = next_starter;
        }
    }
    return nfc;
}

// Returns false when the first code point of s is a member of NFC_No or
// NFC_Maybe, true otherwise.
// Throws illegal_utf when the first code point cannot be parsed.
// NOTE(review): the template parameter list and the template arguments of UTF
// and std::basic_string_view are missing in this copy.
// NOTE(review): the shortcut tests bit 7 of the first code unit only. For
// 16-bit code units a value such as 0x0300 also passes this test - confirm
// that this is intended for the UTF-16 instantiation.
template
bool UTF::is_safe_NFC_start(std::basic_string_view s)
{
    if(s.empty() || (s[0] & 0x80)==0 ) // shortcut for empty string or starts with ASCII char
    {
        return true;
    }
    const CharT* begin = s.data();
    const CharT* const end = s.data() + s.size();
    try
    {
        const uint32_t u = parse(begin, end);
        if(NFC_No.count(u)) return false;
        if(NFC_Maybe.count(u)) return false;
        return true;
    }
    catch(const utf_exception& ue)
    {
        throw illegal_utf(s, begin-s.data(), ue.reason());
    }
}

// Quick check over the code points of s.
// Visible behaviour: returns No or Maybe at the first code point that triggers
// it, Yes when the loop finishes; parse errors are rethrown as illegal_utf
// with the offset of `begin`.
// NOTE(review): the template parameter list and the template arguments are
// missing in this copy.
template
IsNFC UTF::isNFC_quick_check(std::basic_string_view s)
{
    const CharT* begin = s.data();
    const CharT* const end = s.data() + s.size();
    try
    {
        unsigned last_cc = 0;
        while(begin
// NOTE(review): text is missing here: the rest of the loop condition, the
// definitions of `u` and `cc`, and the first part of the condition that ends
// in "cc) )". Presumably it compares last_cc with cc to detect combining
// marks that are out of canonical order - TODO confirm against upstream.
 cc) )
            {
                return IsNFC::No;
            }
            if(NFC_No.count(u)) return IsNFC::No;
            if(NFC_Maybe.count(u)) return IsNFC::Maybe;
            ++begin;
            last_cc = cc;
        }
    }
    catch(const utf_exception& e)
    {
        throw illegal_utf(s, begin - s.data(), e.reason());
    }
    return IsNFC::Yes;
}

// Exact NFC test: uses the quick check first and only normalizes and compares
// when the quick check answers Maybe.
// NOTE(review): the template parameter list and the template arguments are
// missing in this copy.
template
bool UTF::isNFC(std::basic_string_view s)
{
    switch( isNFC_quick_check(s) )
    {
        case IsNFC::Yes : return true;
        case IsNFC::No : return false;
        case IsNFC::Maybe:
        {
            return s == toNFC(s); // very expensive!
        }
    }

    // NOTE(review): throws a plain int, which a handler for std::exception
    // does not catch.
    throw -1; // could never happen, but compiler is too dumb to see this.
}

// NOTE(review): only the head of this specialization is present in this copy,
// and the template argument of UTF is missing.
template<>
bool UTF::isUtf(const char* begin, const char* end)
try{
    for(; begin
// NOTE(review): text is missing here: the rest of isUtf() including its
// function-try-block handler(s), and the template header of toNFC().

// Returns a copy of s when the quick check answers Yes; otherwise the result
// of decomposing, composing and re-encoding s.
 std::basic_string UTF::toNFC(std::basic_string_view s)
{
    if(isNFC_quick_check(s)==IsNFC::Yes)
        return std::basic_string{s};

    return generate( createNFC( fromUtf_decompose(s) ));
}

// Number of code units needed for s, counting 1 to 4 units per code point
// with the same range limits as the char-producing generate() above.
// Throws unexpected_surrogate or too_big.
// NOTE(review): the template argument of UTF is missing in this copy.
template<>
size_t UTF::utf_length(std::u32string_view s)
{
    size_t len = 0;
    for(const char32_t c : s)
    {
        if(c <= 0x7f)
        {
            len += 1;
        }else if(c<=0x7ff)
        {
            len += 2;
        }else if(c<=0xffff)
        {
            if(c>=0xD800 && c<=0xDFFF)
            {
                throw unexpected_surrogate(c);
            }
            len += 3;
        }else if(c<=0x10ffff)
        {
            len += 4;
        }else{
            throw too_big(0, c);
        }
    }
    return len;
}

// Number of code units needed for s, counting 1 or 2 units per code point
// with the same range limits as the char16_t-producing generate() above.
// Throws unexpected_surrogate or too_big.
// NOTE(review): the template argument of UTF is missing in this copy.
template<>
size_t UTF::utf_length(std::u32string_view s)
{
    size_t len = 0;
    for(const char32_t c : s)
    {
        if(c <= 0xffff)
        {
            if(c>=0xD800 && c<=0xDFFF)
            {
                throw unexpected_surrogate(c);
            }
            len += 1;
        }else if(c<=0x10ffff)
        {
            len += 2;
        }else{
            throw too_big(0, c);
        }
    }
    return len;
}

// NOTE(review): for all nfc_string members below the template parameter list
// and the template arguments of UTF are missing in this copy.

// Stores the NFC form of src.
template
UTF::nfc_string::nfc_string(StringView src)
: s{ UTF::toNFC(src) }
{}

// Takes over src by move when the quick check answers Yes, otherwise stores
// its NFC form.
template
UTF::nfc_string::nfc_string(String&& src)
: s{ isNFC_quick_check(src)==IsNFC::Yes ? std::move(src) : toNFC(src) }
{}

// Replaces the content with the NFC form of src.
template
typename UTF::nfc_string& UTF::nfc_string::assign(StringView src)
{
    s = toNFC(src);
    return *this;
}

// Replaces the content; src is moved in when the quick check answers Yes,
// otherwise its NFC form is stored.
template
typename UTF::nfc_string& UTF::nfc_string::assign(String&& src)
{
    s = (isNFC_quick_check(src)==IsNFC::Yes) ? std::move(src) : toNFC(src);
    return *this;
}

// Appends one code unit, then re-normalizes the whole string unless
// is_safe_NFC_start() accepts the appended unit.
template
typename UTF::nfc_string& UTF::nfc_string::push_back(CharT c)
{
    s += c;
    if( !is_safe_NFC_start(StringView{&c, 1}) )
    {
        normalize();
    }
    return *this;
}

// Appends the NFC form of sv, then re-normalizes the whole string unless
// is_safe_NFC_start() accepts the appended part.
template
typename UTF::nfc_string& UTF::nfc_string::operator+=(StringView sv)
{
    const String& sv_nfc = toNFC(sv);
    s += sv_nfc;
    if( !is_safe_NFC_start(sv_nfc) )
    {
        normalize();
    }
    return *this;
}

// Appends the content of ns, then re-normalizes the whole string unless
// is_safe_NFC_start() accepts the appended part.
// NOTE(review): `ns` itself is passed to is_safe_NFC_start(); this relies on
// a conversion of nfc_string to a string view that is not visible in this
// file - confirm in nfc.hh.
template
typename UTF::nfc_string& UTF::nfc_string::operator+=(const UTF::nfc_string& ns)
{
    s += ns.get();
    if( !is_safe_NFC_start(ns) )
    {
        normalize();
    }
    return *this;
}

// True when the stored string is at least as long as sv and its first
// sv.size() units equal sv.
template
bool UTF::nfc_string::starts_with(StringView sv) const noexcept
{
    return (s.size() >= sv.size()) && (StringView{s.data(), sv.size()} == sv);
}

// True when the stored string is at least as long as sv and its last
// sv.size() units equal sv.
template
bool UTF::nfc_string::ends_with(StringView sv) const noexcept
{
    return (s.size() >= sv.size()) && (StringView{s.data() + s.size() - sv.size(), sv.size()} == sv);
}

// Builds a new nfc_string from s.substr(pos,count).
template
typename UTF::nfc_string UTF::nfc_string::substr(std::size_t pos, std::size_t count) const
{
    return nfc_string{s.substr(pos,count)};
}

// Rebuilds the stored string in NFC form unless the quick check answers Yes.
template
void UTF::nfc_string::normalize()
{
    if(isNFC_quick_check(s) != IsNFC::Yes)
    {
        s = generate( createNFC( fromUtf_decompose(s) ));
    }
}

// convenience function to avoid ::strdup(pEp::toNFC(text).c_str());
// and unnecessary temporary std::string etc.
// Returns a buffer obtained from ::new_string() that holds the NFC form of s.
// NOTE(review): ::new_string() is declared elsewhere. This code assumes that
// new_string(nullptr, n) returns a writable buffer for at least n chars plus
// a terminator - TODO confirm. Its result is not checked for null here.
char* strdup_NFC(std::string_view s)
{
    if(UTF8::isNFC_quick_check(s)==IsNFC::Yes)
        return ::new_string(s.data(), s.size());

    // implement the hard way more efficient
    const std::u32string& u32 = createNFC( UTF8::fromUtf_decompose(s) );
    const size_t out_len = UTF8::utf_length(u32);
    char* ret = ::new_string(nullptr, out_len );
    char* iter{ret};

    for(const char32_t c : u32)
    {
        UTF8::generate(c, iter);
    }

    // NOTE(review): this test runs after all writes, so it can only report an
    // overrun, not prevent it; `ret` is not released before the throw.
    if(iter > ret+out_len) // should never happen. ;)
    {
        throw std::logic_error("internal error: strdup_NFC() exceeded output string size");
    }

    return ret;
}

// Returns a malloc()ed copy of *value in which address, fpr, user_id and
// username are replaced by strdup_NFC() copies; all other members are copied
// bitwise by memcpy().
// Throws std::bad_alloc when malloc() fails.
// NOTE(review): strdup_NFC() takes a std::string_view; a null pointer in one
// of the four fields is not handled here. When strdup_NFC() throws, `result`
// and the strings duplicated so far are not released.
pEp_identity *identity_dup_NFC(const ::pEp_identity* value)
{
    ::pEp_identity* result = (::pEp_identity*) malloc(sizeof(::pEp_identity));
    if (!result)
        throw std::bad_alloc();

    memcpy(result, value, sizeof(::pEp_identity));

    result->address = pEp::strdup_NFC(value->address);
    result->fpr = pEp::strdup_NFC(value->fpr);
    result->user_id = pEp::strdup_NFC(value->user_id);
    result->username = pEp::strdup_NFC(value->username);

    return result;
}

// Returns a new identity list that holds identity_dup_NFC() copies of the
// identities of `value`. Copying stops at the end of the list or at the first
// node whose `ident` is null.
// Throws std::bad_alloc when the list or a list node cannot be created.
::identity_list* identity_list_dup_NFC(const ::identity_list* value)
{
    ::identity_list* result = ::new_identity_list(nullptr);
    if (!result)
        throw std::bad_alloc();

    const ::identity_list* il = value;
    ::identity_list* ir = result;
    for (; il && il->ident; il = il->next)
    {
        ir = ::identity_list_add(ir, identity_dup_NFC(il->ident));
        if (!ir)
            throw std::bad_alloc();
    }
    return result;
}

// Explicit instantiations.
// NOTE(review): the template arguments are missing in this copy. Presumably
// one instantiation is for char and one for char16_t, matching the parse()
// specializations above - TODO confirm.
template class UTF;
template class UTF;

// used only to initialize the NFC Compose mapping:
// Inverts NFC_Decompose: every entry whose second element is not negative
// becomes an entry (pair -> composed code point) of the returned map.
// NOTE(review): the template arguments of std::pair are missing in this copy.
std::map< std::pair, unsigned> generate_nfc_compose()
{
    std::map< std::pair, unsigned> m;
    for(const auto& decomp : NFC_Decompose)
    {
        if(decomp.second.second >= 0) // skip singleton decompositions
        {
            m[ decomp.second ] = decomp.first;
        }
    }
    return m;
}

} // end of namespace pEp