// This file is under GNU General Public License 3.0 // see LICENSE.txt #ifndef LIBPEPDATATYPES_NFC_HH #define LIBPEPDATATYPES_NFC_HH #include #include #include #include #include #include namespace pEp { /// Tri-sate return value of isNFC_quick_check() enum class IsNFC { No=0, //!< string contains a character that cannot occur in NFC Maybe=1, //!< string contains a character that is only allowed in certain positions in NFC Yes=2 //!< string contains no invalid or partially valid character }; std::ostream& operator<<(std::ostream& o, IsNFC is_nfc); /// Exception class thrown whenever a string is parsed that is not a valid /// UTF-8 or UTF-16 sequence. class illegal_utf : public std::runtime_error { public: illegal_utf( std::string_view, unsigned position, const std::string& reason); illegal_utf(std::u16string_view, unsigned position, const std::string& reason); explicit illegal_utf(const std::string& message); }; /// Common class template to define the same functions for all 3 Unicode Transfer Formats. template class UTF { public: /// parses a sequence of input code units into one Unicode code point and updates the input iterator c. /// \todo change to iterator templates? static uint32_t parse(const CharT*& c, const CharT* end); /// generates a UTF sequence from a given Unicode code point. template static void generate(const char32_t c, OutIter& out); /// returns whether the sequence starts with IsNFC==Yes char static bool is_safe_NFC_start(std::basic_string_view s); /// returns No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe". /// use isNFC() for a comprehensive NFC check. /// Might throw illegal_utf exception static IsNFC isNFC_quick_check(std::basic_string_view s); /// runs first quick check and a deep test if quick check returns "Maybe". static bool isNFC(std::basic_string_view s); /// returns true if the sequence is valid UTF-8 static bool isUtf(const CharT* begin, const CharT* end); /// converts a C++ string (in UTF-8/-16) into NFC form static std::basic_string toNFC(std::basic_string_view s); /// calculates the number of "code units" in the target Unicode Transfer Format. static size_t utf_length(std::u32string_view s); /// generates a whole u32string at once static std::basic_string generate(const std::u32string& s); /// creates an NFD u32string from UTF-8/UTF-16 input string s static std::u32string fromUtf_decompose(std::basic_string_view s); /// class holding a NFC-conform Unicode string. /// content is mostly read-only, because arbitrary modifications might destroy NFC conformacy. class nfc_string : public boost::totally_ordered2> { public: typedef std::basic_string String; typedef std::basic_string_view StringView; /// only const_reference is supported. typedef typename String::const_reference const_reference; typedef typename String::const_pointer const_pointer; /// only forward iterator. Does a backward_iterator make sense in UTF-encoded strings? typedef typename String::const_iterator const_iterator; static constexpr size_t npos = String::npos; explicit nfc_string(StringView src); explicit nfc_string(String && src); /// construct from a NUL-terminated src explicit nfc_string(const CharT* src) : nfc_string{ StringView{src} } {} nfc_string(const CharT* src, size_t length) : nfc_string{ StringView{src, length} } {} nfc_string() = default; nfc_string(const nfc_string& src) = default; nfc_string( nfc_string&& src) = default; nfc_string& operator=(const nfc_string& src) = default; nfc_string& operator=( nfc_string&& src) = default; nfc_string& assign(StringView src); nfc_string& assign(String && src); nfc_string& assign(const CharT* src) { return this->assign(StringView{src}); } nfc_string& operator=(StringView src) { return this->assign(src); } nfc_string& operator=(String && src) { return this->assign(std::move(src)); } nfc_string& operator=(const CharT* src) { return this->assign(StringView{src}); } /// read-only: shares representation operator const String&() const noexcept { return s; } /// read-only: shares representation const String& get() const noexcept { return s;} /// read write: copy content operator String() const { return s; } const CharT* c_str() const noexcept { return s.c_str(); } const CharT* data() const noexcept { return s.data(); } std::size_t size() const noexcept { return s.size(); } bool empty() const noexcept { return s.empty(); } std::size_t capacity() const noexcept { return s.capacity(); } void reserve(std::size_t new_capacity) { s.reserve(new_capacity); } void shrink_to_fit() { s.shrink_to_fit(); } const_reference operator[](std::size_t ofs) const noexcept { return s[ofs]; } const_reference at(std::size_t ofs) const { return s.at(ofs); } const_reference front() const noexcept { return s.front(); } const_reference back() const noexcept { return s.back(); } operator StringView() const noexcept { return StringView{s}; } const_iterator begin() const noexcept { return s.cbegin(); } const_iterator cbegin() const noexcept { return s.cbegin(); } /// r/o access only const_iterator end() const noexcept { return s.cend(); } const_iterator cend() const noexcept { return s.cend(); } /// r/o access only void clear() { s.clear(); } /// I am lazy and delegate all the 10 different insert() overloads directly to s. template nfc_string& insert(Args&& ...args) { s.insert( std::forward(args)... ); normalize(); return *this; } /// delegates all erase() overloads to s. template nfc_string& erase(Args&& ...args) { s.erase( std::forward(args)... ); normalize(); return *this; } nfc_string& push_back(CharT c); /// delegates all 9 append() overloads to s. template nfc_string& append(Args&& ...args) { s.append( std::forward(args)... ); normalize(); return *this; } /// more expensive, because 's' might not be in NFC. nfc_string& operator+=(StringView s); /// optimization possible to avoid re-normalization in most cases. nfc_string& operator+=(const nfc_string& s); /// optimization possible to avoid re-normalization in most cases. nfc_string& operator+=(CharT c) { push_back(c); return *this; } /// delegates all 9 compare() overloads to s template int compare(Args&& ...args) const { return s.compare( std::forward(args)... ); } /// stolen from C++20 bool starts_with(StringView s) const noexcept; /// stolen from C++20 bool ends_with(StringView s) const noexcept; /// delegates all 5 find() overloads to s template std::size_t find(Args&& ...args) const { return s.find( std::forward(args)... ); } /// might throw illegal_utf, if a multi-char sequence is clipped. nfc_string substr(std::size_t pos=0, std::size_t count=npos) const; private: std::basic_string s; /// (re-)normalize the content string s. void normalize(); }; }; /// can be more efficient than the operator+() below. template typename UTF::nfc_string operator+( typename UTF::nfc_string left, const typename UTF::nfc_string& right); template inline typename UTF::nfc_string operator+(typename UTF::nfc_string left, const T& right) { return left+=right; } template inline typename UTF::nfc_string operator+(typename UTF::nfc_string&& left, const T& right) { return left+=right; } template inline typename UTF::nfc_string operator+(const T& left, const typename UTF::nfc_string& right) { typename UTF::nfc_string left_s{left}; return left_s+=right; } template inline bool operator<(const typename UTF::nfc_string& left, std::basic_string_view right) { return left inline bool operator==(const typename UTF::nfc_string& left, std::basic_string_view right) { return left==right; } /// convenient alias names: using UTF8 = UTF; using UTF16 = UTF; using nfc_string = UTF8::nfc_string; using nfc_u16string = UTF16::nfc_string; // throws illegal_utf8 exception if s is not valid UTF-8 void assert_utf8(std::string_view s); // convert NFD to NFC std::u32string createNFC(std::u32string nfd_string); /* // return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe" // might throw illegal_utf exception template IsNFC isNFC_quick_check(std::basic_string_view s); // runs first quick check and a deep test if quick check returns "Maybe". template bool isNFC(std::basic_string_view s); // returns true if the sequence is valid UTF-8 bool isUtf8(const char* begin, const char* end); // converts a C++ string (in UTF-8) into NFC form // s is ''moved'' to the return value if possible so no copy is done here. template std::basic_string toNFC(std::basic_string_view s); */ // creates a UTF-8-encoded NFC string from s std::string toNFC_8(std::u16string_view s); // convenience functions to avoid ::strdup(pEp::toNFC(text).c_str()); // and unecessary temporary std::string etc. char* strdup_NFC(std::string_view s); pEp_identity *identity_dup_NFC(const ::pEp_identity* value); ::identity_list* identity_list_dup_NFC(const ::identity_list* value); } // end of namespace pEp #endif // LIBPEPDATATYPES_NFC_HH