You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

327 lines
9.8 KiB

// This file is under GNU General Public License 3.0
// see LICENSE.txt
#ifndef LIBPEPDATATYPES_NFC_HH
#define LIBPEPDATATYPES_NFC_HH
#include <cstddef>
#include <cstdint>
#include <iosfwd>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>
#include <boost/operators.hpp>
#include <pEp/identity_list.h>
namespace pEp {
/// Three-valued result of isNFC_quick_check().
enum class IsNFC
{
    /// the string contains a character that cannot occur in NFC
    No    = 0,
    /// the string contains a character that is only allowed in certain positions in NFC
    Maybe = 1,
    /// the string contains no invalid or partially valid character
    Yes   = 2,
};

/// Writes an IsNFC value to the stream o.
/// Only declared here; the textual representation is chosen by the implementation.
std::ostream& operator<<(std::ostream& o, IsNFC is_nfc);
/// Exception type that is thrown whenever a parsed string turns out not to be
/// a valid UTF-8 or UTF-16 sequence.
class illegal_utf : public std::runtime_error
{
public:
    /// Error in the UTF-8 input \p text.
    /// \p position locates the problem inside the input and \p reason describes it;
    /// how both are turned into the what() message is decided by the implementation.
    illegal_utf(std::string_view text, unsigned position, const std::string& reason);

    /// Same as above, for UTF-16 input.
    illegal_utf(std::u16string_view text, unsigned position, const std::string& reason);

    /// Error described by a ready-made \p message.
    explicit illegal_utf(const std::string& message);
};
/// Class template that offers one identical set of functions for all 3 Unicode Transfer Formats.
/// CharT is the code unit type of the transfer format.
template<class CharT>
class UTF
{
public:
    /// Parses a sequence of input code units into one Unicode code point
    /// and updates the input iterator c while doing so.
    /// \todo change to iterator templates?
    static uint32_t parse(const CharT*& c, const CharT* end);

    /// Generates the UTF sequence that belongs to the given Unicode code point c.
    template<class OutIter>
    static void generate(const char32_t c, OutIter& out);

    /// Tells whether the sequence starts with an IsNFC==Yes character.
    static bool is_safe_NFC_start(std::basic_string_view<CharT> s);

    /// Yields No or Maybe if at least one character has the NFC_Quickcheck class "No" or "Maybe".
    /// Use isNFC() when a comprehensive NFC check is needed.
    /// Might throw an illegal_utf exception.
    static IsNFC isNFC_quick_check(std::basic_string_view<CharT> s);

    /// Runs the quick check first, followed by a deep test if the quick check says "Maybe".
    static bool isNFC(std::basic_string_view<CharT> s);

    /// Returns true if the sequence delimited by begin and end is valid.
    /// NOTE(review): the original comment says "valid UTF-8"; presumably this means
    /// valid in the transfer format selected by CharT -- confirm.
    static bool isUtf(const CharT* begin, const CharT* end);

    /// Converts a C++ string (in UTF-8/-16) into NFC form.
    static std::basic_string<CharT> toNFC(std::basic_string_view<CharT> s);

    /// Calculates the number of "code units" in the target Unicode Transfer Format.
    static size_t utf_length(std::u32string_view s);

    /// Generates the output for a whole u32string at once.
    static std::basic_string<CharT> generate(const std::u32string& s);

    /// Creates an NFD u32string from the UTF-8/UTF-16 input string s.
    static std::u32string fromUtf_decompose(std::basic_string_view<CharT> s);


    /// Class that holds an NFC-conformant Unicode string.
    /// The content is mostly read-only, because arbitrary modifications might destroy NFC conformance.
    class nfc_string : public boost::totally_ordered2<nfc_string, std::basic_string_view<CharT>>
    {
    public:
        // ---- member types ----

        using String     = std::basic_string<CharT>;
        using StringView = std::basic_string_view<CharT>;

        /// only const_reference is supported.
        using const_reference = typename String::const_reference;
        using const_pointer   = typename String::const_pointer;

        /// only forward iterator. Does a backward_iterator make sense in UTF-encoded strings?
        using const_iterator  = typename String::const_iterator;

        static constexpr size_t npos = String::npos;

        // ---- construction ----

        explicit nfc_string(StringView src);
        explicit nfc_string(String && src);

        /// construct from a NUL-terminated src (delegates to the StringView constructor)
        explicit nfc_string(const CharT* src)
            : nfc_string{ StringView{src} }
        {
        }

        /// construct from the first length code units of src (delegates to the StringView constructor)
        nfc_string(const CharT* src, size_t length)
            : nfc_string{ StringView{src, length} }
        {
        }

        nfc_string() = default;
        nfc_string(const nfc_string& src) = default;
        nfc_string(nfc_string&& src) = default;

        // ---- assignment ----

        nfc_string& operator=(const nfc_string& src) = default;
        nfc_string& operator=(nfc_string&& src) = default;

        nfc_string& assign(StringView src);
        nfc_string& assign(String && src);

        /// NUL-terminated input is wrapped into a StringView and handed to assign(StringView).
        nfc_string& assign(const CharT* src)
        {
            return assign(StringView{src});
        }

        nfc_string& operator=(StringView src)
        {
            return assign(src);
        }

        nfc_string& operator=(String && src)
        {
            return assign(std::move(src));
        }

        nfc_string& operator=(const CharT* src)
        {
            return assign(StringView{src});
        }

        // ---- access to the underlying string ----

        /// read-only: shares representation
        operator const String&() const noexcept { return s; }

        /// read-only: shares representation
        const String& get() const noexcept { return s; }

        /// read write: copy content
        operator String() const { return s; }

        /// non-owning view onto the stored string
        operator StringView() const noexcept { return StringView{s}; }

        const CharT* c_str() const noexcept { return s.c_str(); }
        const CharT* data()  const noexcept { return s.data(); }

        // ---- size and capacity (forwarded to the underlying string) ----

        std::size_t size()     const noexcept { return s.size(); }
        bool        empty()    const noexcept { return s.empty(); }
        std::size_t capacity() const noexcept { return s.capacity(); }

        void reserve(std::size_t new_capacity) { s.reserve(new_capacity); }
        void shrink_to_fit()                   { s.shrink_to_fit(); }

        // ---- element access, read-only ----

        const_reference operator[](std::size_t ofs) const noexcept { return s[ofs]; }
        const_reference at(std::size_t ofs) const                  { return s.at(ofs); }
        const_reference front() const noexcept                     { return s.front(); }
        const_reference back()  const noexcept                     { return s.back(); }

        // ---- iteration: r/o access only, begin()/end() hand out const iterators as well ----

        const_iterator begin()  const noexcept { return s.cbegin(); }
        const_iterator cbegin() const noexcept { return s.cbegin(); }
        const_iterator end()    const noexcept { return s.cend(); }
        const_iterator cend()   const noexcept { return s.cend(); }

        // ---- modifiers ----

        void clear() { s.clear(); }

        /// All the different insert() overloads are forwarded directly to the underlying string;
        /// the content is re-normalized afterwards.
        template<typename... Args>
        nfc_string& insert(Args&& ...args)
        {
            s.insert( std::forward<Args>(args)... );
            normalize();
            return *this;
        }

        /// Forwards all erase() overloads to the underlying string, then re-normalizes.
        template<typename... Args>
        nfc_string& erase(Args&& ...args)
        {
            s.erase( std::forward<Args>(args)... );
            normalize();
            return *this;
        }

        nfc_string& push_back(CharT c);

        /// Forwards all append() overloads to the underlying string, then re-normalizes.
        template<typename... Args>
        nfc_string& append(Args&& ...args)
        {
            s.append( std::forward<Args>(args)... );
            normalize();
            return *this;
        }

        /// more expensive, because 's' might not be in NFC.
        nfc_string& operator+=(StringView s);

        /// optimization possible to avoid re-normalization in most cases.
        nfc_string& operator+=(const nfc_string& s);

        /// appends the single code unit c by calling push_back().
        nfc_string& operator+=(CharT c)
        {
            push_back(c);
            return *this;
        }

        // ---- comparison and search ----

        /// Forwards all compare() overloads to the underlying string.
        template<typename... Args>
        int compare(Args&& ...args) const
        {
            return s.compare( std::forward<Args>(args)... );
        }

        /// stolen from C++20
        bool starts_with(StringView s) const noexcept;

        /// stolen from C++20
        bool ends_with(StringView s) const noexcept;

        /// Forwards all find() overloads to the underlying string.
        template<typename... Args>
        std::size_t find(Args&& ...args) const
        {
            return s.find( std::forward<Args>(args)... );
        }

        /// might throw illegal_utf, if a multi-char sequence is clipped.
        nfc_string substr(std::size_t pos=0, std::size_t count=npos) const;

    private:
        /// the stored string
        String s;

        /// (re-)normalize the content string s.
        void normalize();
    };
};
/// operator+ for two nfc_string operands; only declared here, the definition lives elsewhere.
/// can be more efficient than the operator+() below.
/// NOTE(review): CharT occurs only inside `typename UTF<CharT>::nfc_string`, which is a
/// non-deduced context, so callers presumably have to name CharT explicitly -- confirm.
template<class CharT>
typename
UTF<CharT>::nfc_string operator+(
typename UTF<CharT>::nfc_string left,
const typename UTF<CharT>::nfc_string& right);
/// Appends \p right to \p left with nfc_string::operator+= and returns the result by value.
/// \p left is taken by value, so the caller's object is never modified.
/// NOTE(review): CharT occurs only inside `typename UTF<CharT>::nfc_string`, which is a
/// non-deduced context, so callers presumably have to name CharT explicitly -- confirm.
template<class CharT, class T>
inline
typename
UTF<CharT>::nfc_string operator+(typename UTF<CharT>::nfc_string left, const T& right)
{
    left += right;
    // Returning the parameter by name lets the compiler move it into the result;
    // `return left += right;` would return through an lvalue reference and force a copy.
    return left;
}
/// Appends \p right to the expiring object \p left with nfc_string::operator+= and
/// returns the result by value, reusing the storage of \p left.
/// After the call the object \p left referred to is in a moved-from state.
/// NOTE(review): CharT occurs only inside `typename UTF<CharT>::nfc_string`, which is a
/// non-deduced context, so callers presumably have to name CharT explicitly -- confirm.
template<class CharT, class T>
inline
typename
UTF<CharT>::nfc_string operator+(typename UTF<CharT>::nfc_string&& left, const T& right)
{
    left += right;
    // A named rvalue reference is an lvalue; without std::move the result would be
    // copy-constructed although the caller already gave up the object.
    return std::move(left);
}
/// Builds an nfc_string from \p left, appends \p right to it with nfc_string::operator+=
/// and returns the result by value.
/// NOTE(review): CharT occurs only inside `typename UTF<CharT>::nfc_string`, which is a
/// non-deduced context, so callers presumably have to name CharT explicitly -- confirm.
template<class CharT, class T>
inline
typename
UTF<CharT>::nfc_string operator+(const T& left, const typename UTF<CharT>::nfc_string& right)
{
    typename UTF<CharT>::nfc_string result{left};
    result += right;
    // Returning the local by name permits copy elision or a move;
    // `return result += right;` would return through an lvalue reference and force a copy.
    return result;
}
/// Less-than comparison of an nfc_string with a string view.
/// Compares the stored code units of \p left with \p right using the
/// std::basic_string_view ordering.
template<class CharT>
inline
bool operator<(const typename UTF<CharT>::nfc_string& left, std::basic_string_view<CharT> right)
{
    // Compare two explicit views. Writing `left < right` here would have exactly the
    // argument types of this overload and so call it again, recursing without end.
    return std::basic_string_view<CharT>(left.data(), left.size()) < right;
}
/// Equality comparison of an nfc_string with a string view.
/// Compares the stored code units of \p left with \p right using the
/// std::basic_string_view equality.
template<class CharT>
inline
bool operator==(const typename UTF<CharT>::nfc_string& left, std::basic_string_view<CharT> right)
{
    // Compare two explicit views. Writing `left == right` here would have exactly the
    // argument types of this overload and so call it again, recursing without end.
    return std::basic_string_view<CharT>(left.data(), left.size()) == right;
}
/// convenient alias names:
/// UTF8 / UTF16 are the UTF class template instantiated for char / char16_t code units.
using UTF8 = UTF<char>;
using UTF16 = UTF<char16_t>;
/// the nested NFC string classes of the two instantiations above.
using nfc_string = UTF8::nfc_string;
using nfc_u16string = UTF16::nfc_string;
// throws an exception if s is not valid UTF-8
// NOTE(review): the original comment names "illegal_utf8"; the exception class declared
// in this header is illegal_utf, so presumably that one is meant -- confirm.
void assert_utf8(std::string_view s);
// convert NFD to NFC; the input is taken by value and the result is returned by value.
std::u32string createNFC(std::u32string nfd_string);
/*
// return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe"
// might throw illegal_utf exception
template<class CharT>
IsNFC isNFC_quick_check(std::basic_string_view<CharT> s);
// runs first quick check and a deep test if quick check returns "Maybe".
template<class CharT>
bool isNFC(std::basic_string_view<CharT> s);
// returns true if the sequence is valid UTF-8
bool isUtf8(const char* begin, const char* end);
// converts a C++ string (in UTF-8) into NFC form
// s is ''moved'' to the return value if possible so no copy is done here.
template<class CharT>
std::basic_string<CharT> toNFC(std::basic_string_view<CharT> s);
*/
// creates a UTF-8-encoded NFC string from the UTF-16 input s
std::string toNFC_8(std::u16string_view s);
// convenience functions to avoid ::strdup(pEp::toNFC<char>(text).c_str());
// and unnecessary temporary std::string etc.
// NOTE(review): the returned pointers are presumably newly allocated and owned by the
// caller, as with ::strdup() -- confirm against the implementation.
char* strdup_NFC(std::string_view s);
// NOTE(review): by their names these duplicate a pEp_identity / an identity_list with
// NFC conversion applied -- which fields are converted is not visible here; confirm.
pEp_identity *identity_dup_NFC(const ::pEp_identity* value);
::identity_list* identity_list_dup_NFC(const ::identity_list* value);
} // end of namespace pEp
#endif // LIBPEPDATATYPES_NFC_HH