
4 changed files with 3671 additions and 0 deletions
@ -0,0 +1,505 @@ |
|||
// This file is under GNU General Public License 3.0
|
|||
// see LICENSE.txt
|
|||
|
|||
// converts a C++ string into NFC form
|
|||
|
|||
#include "nfc.hh" |
|||
#include <cstdint> |
|||
#include <set> |
|||
#include <ostream> |
|||
#include <algorithm> |
|||
|
|||
#include "nfc_sets.hh" |
|||
|
|||
namespace |
|||
{ |
|||
// unicode to hex string
|
|||
std::string u2h(unsigned u) |
|||
{ |
|||
char buf[16] = {0}; |
|||
snprintf(buf, 15, "<U+%04X>", u ); |
|||
return buf; |
|||
} |
|||
|
|||
// octet to hex string
|
|||
std::string o2h(uint8_t octet) |
|||
{ |
|||
char buf[16] = {0}; |
|||
snprintf(buf, 15, "0x%02hhX", octet); |
|||
return buf; |
|||
} |
|||
|
|||
|
|||
class utf8_exception |
|||
{ |
|||
public: |
|||
utf8_exception(uint8_t u) : octet(u) {} |
|||
virtual ~utf8_exception() = default; |
|||
virtual std::string reason() const = 0; |
|||
uint8_t octet; |
|||
}; |
|||
|
|||
|
|||
class cont_without_start : public utf8_exception |
|||
{ |
|||
public: |
|||
cont_without_start(uint8_t u) : utf8_exception(u) {} |
|||
std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; } |
|||
}; |
|||
|
|||
|
|||
class overlong_sequence : public utf8_exception |
|||
{ |
|||
public: |
|||
overlong_sequence(uint8_t octet, unsigned u) : utf8_exception(octet), unicode(u) {} |
|||
std::string reason() const override { return "Overlong sequence for " + u2h(unicode); } |
|||
unsigned unicode; |
|||
}; |
|||
|
|||
|
|||
class unexpected_end : public utf8_exception |
|||
{ |
|||
public: |
|||
unexpected_end(uint8_t u) : utf8_exception(u) {} |
|||
std::string reason() const override { return "Unexpected end of string"; } |
|||
}; |
|||
|
|||
class surrogate : public utf8_exception |
|||
{ |
|||
public: |
|||
surrogate(uint8_t u, unsigned s) : utf8_exception(u), surr(s) {} |
|||
std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; } |
|||
private: |
|||
unsigned surr; |
|||
}; |
|||
|
|||
class no_unicode : public utf8_exception |
|||
{ |
|||
public: |
|||
explicit no_unicode(uint8_t _octet) : utf8_exception(_octet) {} |
|||
std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; } |
|||
}; |
|||
|
|||
class too_big : public utf8_exception |
|||
{ |
|||
public: |
|||
explicit too_big(uint8_t _octet, unsigned u) : utf8_exception(_octet), unicode(u) {} |
|||
std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; } |
|||
unsigned unicode; |
|||
}; |
|||
|
|||
|
|||
std::string escape(sv s) |
|||
{ |
|||
std::string ret; ret.reserve(s.size() + 16 ); |
|||
for(char c : s) |
|||
{ |
|||
const uint8_t u = c; |
|||
if(u>=32 && u<=126) |
|||
{ |
|||
ret += c; |
|||
}else{ |
|||
char buf[16]; |
|||
snprintf(buf,15, "«%02x»", u ); |
|||
ret += buf; |
|||
} |
|||
} |
|||
return ret; |
|||
} |
|||
|
|||
// returns the "CanonicalCombinincClass" of the given Unicode codpoint u
|
|||
unsigned canonicalClass(unsigned u) |
|||
{ |
|||
const auto q = NFC_CombiningClass.find(u); |
|||
if(q==NFC_CombiningClass.end()) |
|||
{ |
|||
return 0; // not found in map.
|
|||
}else{ |
|||
return q->second; |
|||
} |
|||
} |
|||
|
|||
std::pair<int,int> decompose(unsigned u) |
|||
{ |
|||
const auto q = NFC_Decompose.find(u); |
|||
if(q==NFC_Decompose.end()) |
|||
{ |
|||
return std::make_pair(-1, -1); |
|||
}else{ |
|||
return q->second; |
|||
} |
|||
} |
|||
|
|||
std::u32string decompose_full(unsigned u) |
|||
{ |
|||
const std::pair<int,int> d = decompose(u); |
|||
if(d.first<0) |
|||
{ |
|||
return std::u32string( 1, char32_t(u) ); |
|||
}else{ |
|||
if(d.second<0) |
|||
{ |
|||
return decompose_full(d.first); |
|||
} |
|||
} |
|||
return decompose_full(d.first) + decompose_full(d.second); |
|||
} |
|||
|
|||
|
|||
// according to Unicode Standard, clause D108:
|
|||
bool isReorderablePair(unsigned a, unsigned b) |
|||
{ |
|||
const unsigned cca = canonicalClass(a); |
|||
const unsigned ccb = canonicalClass(b); |
|||
|
|||
return (cca > ccb) && (ccb>0); |
|||
} |
|||
|
|||
// Unicode standard requires bubble sort, for stability reasons?
|
|||
void canonicalOrdering(std::u32string& us) |
|||
{ |
|||
if(us.size()<2) |
|||
return; |
|||
|
|||
for(unsigned n=us.size(); n>1; --n) |
|||
for(unsigned i=0; i<n-1; ++i) |
|||
{ |
|||
char32_t& a = us[i]; |
|||
char32_t& b = us[i+1]; |
|||
if( isReorderablePair(a,b) ) |
|||
{ |
|||
std::swap(a,b); |
|||
} |
|||
} |
|||
} |
|||
|
|||
} // end of anonymous namespace
|
|||
|
|||
|
|||
std::ostream& operator<<(std::ostream& o, IsNFC is_nfc) |
|||
{ |
|||
switch(is_nfc) |
|||
{ |
|||
case IsNFC::No : return o << "No"; |
|||
case IsNFC::Maybe : return o << "Maybe"; |
|||
case IsNFC::Yes : return o << "Yes"; |
|||
} |
|||
throw std::logic_error("Unknown value of IsNFC"); |
|||
} |
|||
|
|||
|
|||
uint32_t parseUtf8(const char*& c, const char* end) |
|||
{ |
|||
while(c<end) |
|||
{ |
|||
const uint8_t u = uint8_t(*c); |
|||
|
|||
if (u<=0x7f) |
|||
{ |
|||
return u; |
|||
} else if (u<=0xBF) |
|||
{ |
|||
throw cont_without_start(u); |
|||
} else if (u<=0xC1) // 0xC0, 0xC1 would form "overlong sequences" and are therefore always illegal in UTF-8
|
|||
{ |
|||
throw no_unicode(u); |
|||
} else if (u<=0xDF) // 2 octet sequence
|
|||
{ |
|||
++c; |
|||
if(c==end) throw unexpected_end(u); |
|||
const uint8_t uu = uint8_t(*c); |
|||
if((uu & 0xC0) != 0x80) |
|||
{ |
|||
throw unexpected_end(uu); |
|||
} |
|||
return ((u & 0x1F) << 6) + (uu & 0x3F); |
|||
} else if (u<=0xEF) // 3 octet sequence
|
|||
{ |
|||
++c; |
|||
if(c==end) throw unexpected_end(u); |
|||
const uint8_t uu = uint8_t(*c); |
|||
if((uu & 0xC0) != 0x80) |
|||
{ |
|||
throw unexpected_end(uu); |
|||
} |
|||
++c; |
|||
if(c==end) throw unexpected_end(uu); |
|||
const uint8_t uuu = uint8_t(*c); |
|||
if((uuu & 0xC0) != 0x80) |
|||
{ |
|||
throw unexpected_end(uuu); |
|||
} |
|||
|
|||
const uint32_t ret = ((u & 0xF) << 12) + ((uu & 0x3F)<<6) + (uuu & 0x3F); |
|||
if(ret<0x800) throw overlong_sequence(u, ret); |
|||
if(ret>=0xD800 && ret<=0xDFFF) throw surrogate(u, ret); |
|||
return ret; |
|||
} else if (u<=0xF4) // 4 octet sequence
|
|||
{ |
|||
++c; |
|||
if(c==end) throw unexpected_end(u); |
|||
const uint8_t uu = uint8_t(*c); |
|||
if((uu & 0xC0) != 0x80) |
|||
{ |
|||
throw unexpected_end(uu); |
|||
} |
|||
++c; |
|||
if(c==end) throw unexpected_end(uu); |
|||
const uint8_t uuu = uint8_t(*c); |
|||
if((uuu & 0xC0) != 0x80) |
|||
{ |
|||
throw unexpected_end(uuu); |
|||
} |
|||
++c; |
|||
if(c==end) throw unexpected_end(uuu); |
|||
const uint8_t uuuu = uint8_t(*c); |
|||
if((uuuu & 0xC0) != 0x80) |
|||
{ |
|||
throw unexpected_end(uuuu); |
|||
} |
|||
|
|||
const uint32_t ret = ((u & 0xF) << 18) + ((uu & 0x3F)<<12) + ((uuu & 0x3F)<<6) + (uuuu & 0x3F); |
|||
if(ret<0x10000) throw overlong_sequence(u, ret); |
|||
if(ret>0x10FFFF) throw too_big(u, ret); |
|||
return ret; |
|||
} else |
|||
{ |
|||
throw no_unicode(u); |
|||
} |
|||
} |
|||
|
|||
throw unexpected_end(-1); |
|||
} |
|||
|
|||
|
|||
void toUtf8(const char32_t c, std::string& ret) |
|||
{ |
|||
if(c<=0x7F) |
|||
{ |
|||
ret += char(c); |
|||
}else if(c<=0x7FF) |
|||
{ |
|||
ret += char( 0xC0 + (c>>6) ); |
|||
ret += char( 0x80 + (c & 63)); |
|||
}else if(c<=0xFFFF) |
|||
{ |
|||
ret += char( 0xE0 + (c>>12) ); |
|||
ret += char( 0x80 + ((c>>6) & 63)); |
|||
ret += char( 0x80 + (c & 63)); |
|||
}else if(c<=0x10FFFF) |
|||
{ |
|||
ret += char( 0xF0 + (c>>18) ); |
|||
ret += char( 0x80 + ((c>>12) & 63)); |
|||
ret += char( 0x80 + ((c>>6) & 63)); |
|||
ret += char( 0x80 + (c & 63)); |
|||
}else{ |
|||
throw too_big(0, c); |
|||
} |
|||
} |
|||
|
|||
|
|||
std::string toUtf8(const std::u32string& u32) |
|||
{ |
|||
std::string ret; |
|||
for(char32_t c : u32) |
|||
{ |
|||
toUtf8(c, ret); |
|||
} |
|||
return ret; |
|||
} |
|||
|
|||
|
|||
illegal_utf8::illegal_utf8( sv s, unsigned position, const std::string& reason) |
|||
: std::runtime_error( "Illegal UTF-8 string \"" + escape(s) + "\" at position " + std::to_string(position) + ": " + reason ) |
|||
{} |
|||
|
|||
|
|||
illegal_utf8::illegal_utf8( const std::string& msg ) |
|||
: std::runtime_error( msg ) |
|||
{} |
|||
|
|||
|
|||
void assert_utf8(sv s) |
|||
{ |
|||
const char* begin = s.data(); |
|||
const char* const end = s.data() + s.size(); |
|||
try |
|||
{ |
|||
while(begin<end) |
|||
{ |
|||
parseUtf8(begin, end); // ignore the output
|
|||
++begin; |
|||
} |
|||
} |
|||
catch(const utf8_exception& e) |
|||
{ |
|||
throw illegal_utf8(s, begin - s.data(), e.reason()); |
|||
} |
|||
} |
|||
|
|||
|
|||
// creates a NFD string from s
|
|||
std::u32string fromUtf8_decompose(sv s) |
|||
{ |
|||
std::u32string u32s; |
|||
u32s.reserve( static_cast<std::size_t>(s.size()*1.25) ); |
|||
const char* begin = s.data(); |
|||
const char* end = s.data() + s.size(); |
|||
for(; begin<end; ++begin) |
|||
{ |
|||
unsigned u = parseUtf8(begin, end); |
|||
u32s += decompose_full(u); |
|||
} |
|||
canonicalOrdering(u32s); // works inplace.
|
|||
return u32s; |
|||
} |
|||
|
|||
|
|||
template<class Iter> |
|||
bool blocked(Iter L, Iter C) |
|||
{ |
|||
Iter B = L; ++B; |
|||
for(;B!=C;++B) |
|||
{ |
|||
if(canonicalClass(*B)==0 || canonicalClass(*B)==canonicalClass(*C)) |
|||
return true; |
|||
} |
|||
return false; |
|||
} |
|||
|
|||
|
|||
template<class Iter> |
|||
void combine(std::u32string& nfc, Iter starter, Iter next_starter) |
|||
{ |
|||
Iter c = starter; ++c; |
|||
for(;c!=next_starter; ++c) |
|||
{ |
|||
if(!blocked(starter, c)) |
|||
{ |
|||
const unsigned starter_u = *starter; |
|||
const unsigned c_u = *c; |
|||
|
|||
auto q = NFC_Compose.find( std::make_pair(starter_u,c_u) ); |
|||
if(q!=NFC_Compose.end()) |
|||
{ |
|||
*starter = q->second; |
|||
*c = -1; |
|||
} |
|||
} |
|||
} |
|||
|
|||
// now add the remaining/changed characters to the NFC string:
|
|||
for(Iter c = starter; c!=next_starter; ++c) |
|||
{ |
|||
if( int(*c) >= 0) |
|||
{ |
|||
nfc += *c; |
|||
} |
|||
} |
|||
} |
|||
|
|||
// the nfd string is changed during composing process. So it works on a copy or call with std::move().
|
|||
std::u32string createNFC(std::u32string nfd) |
|||
{ |
|||
if(nfd.size()<=1) |
|||
return nfd; |
|||
|
|||
std::u32string nfc; |
|||
nfc.reserve(nfd.size()); |
|||
auto starter = nfd.begin(); |
|||
while( starter != nfd.end() ) |
|||
{ |
|||
if( canonicalClass(*starter)!=0 ) |
|||
{ |
|||
nfc += *starter; |
|||
++starter; |
|||
}else{ |
|||
auto next_starter = std::find_if(starter+1, nfd.end(), [](char32_t c){return canonicalClass(c)==0;} ); |
|||
combine(nfc, starter, next_starter); |
|||
starter = next_starter; |
|||
} |
|||
} |
|||
return nfc; |
|||
} |
|||
|
|||
|
|||
IsNFC isNFC_quick_check(sv s) |
|||
{ |
|||
const char* begin = s.data(); |
|||
const char* const end = s.data() + s.size(); |
|||
try |
|||
{ |
|||
unsigned last_cc = 0; |
|||
while(begin<end) |
|||
{ |
|||
const uint32_t u = parseUtf8(begin, end); |
|||
const unsigned cc = canonicalClass(u); |
|||
if( (cc!=0) && (last_cc > cc) ) |
|||
{ |
|||
return IsNFC::No; |
|||
} |
|||
if(NFC_No.count(u)) return IsNFC::No; |
|||
if(NFC_Maybe.count(u)) return IsNFC::Maybe; |
|||
++begin; |
|||
last_cc = cc; |
|||
} |
|||
} |
|||
catch(const utf8_exception& e) |
|||
{ |
|||
throw illegal_utf8(s, begin - s.data(), e.reason()); |
|||
} |
|||
return IsNFC::Yes; |
|||
} |
|||
|
|||
|
|||
bool isNFC(sv s) |
|||
{ |
|||
switch( isNFC_quick_check(s) ) |
|||
{ |
|||
case IsNFC::Yes : return true; |
|||
case IsNFC::No : return false; |
|||
case IsNFC::Maybe: |
|||
{ |
|||
return s == toNFC(s); // very expensive!
|
|||
} |
|||
} |
|||
|
|||
throw -1; // could never happen, but compiler is too dumb to see this.
|
|||
} |
|||
|
|||
bool isUtf8(const char* begin, const char* end) |
|||
try{ |
|||
for(; begin<end; ++begin) |
|||
{ |
|||
(void)parseUtf8(begin, end); |
|||
} |
|||
return true; |
|||
}catch(const illegal_utf8&) |
|||
{ |
|||
return false; |
|||
} |
|||
|
|||
// s is ''moved'' to the return value if possible so no copy is done here.
|
|||
std::string toNFC(sv s) |
|||
{ |
|||
if(isNFC_quick_check(s)==IsNFC::Yes) |
|||
return std::string{s}; |
|||
|
|||
return toUtf8( createNFC( fromUtf8_decompose(s) )); |
|||
} |
|||
|
|||
|
|||
// used only to initialize the NFC Compose mapping:
|
|||
std::map< std::pair<unsigned, unsigned>, unsigned> generate_nfc_compose() |
|||
{ |
|||
std::map< std::pair<unsigned, unsigned>, unsigned> m; |
|||
for(const auto& decomp : NFC_Decompose) |
|||
{ |
|||
if(decomp.second.second >= 0) // skip singleton decompositions
|
|||
{ |
|||
m[ decomp.second ] = decomp.first; |
|||
} |
|||
} |
|||
|
|||
return m; |
|||
} |
@ -0,0 +1,60 @@ |
|||
// This file is under GNU General Public License 3.0
|
|||
// see LICENSE.txt
|
|||
|
|||
#ifndef JSON_ADAPTER_NFC_HH |
|||
#define JSON_ADAPTER_NFC_HH |
|||
|
|||
#include "config.hh" // to switch between std::string_view or boost::string_view.hh |
|||
#include <string> |
|||
#include <stdexcept> |
|||
#include <iosfwd> |
|||
|
|||
enum class IsNFC |
|||
{ |
|||
No=0, // contains a character that cannot occur in NFC
|
|||
Maybe=1, // contains a character that is only allowed in certain positions in NFC
|
|||
Yes=2 // contains no invalid or partially valid character
|
|||
}; |
|||
|
|||
std::ostream& operator<<(std::ostream& o, IsNFC is_nfc); |
|||
|
|||
|
|||
class illegal_utf8 : public std::runtime_error |
|||
{ |
|||
public: |
|||
illegal_utf8(sv, unsigned position, const std::string& reason); |
|||
protected: |
|||
explicit illegal_utf8(const std::string& message); |
|||
}; |
|||
|
|||
|
|||
// scans the char sequences and parses UTF-8 sequences. Detect UTF-8 errors and throws exceptions.
|
|||
uint32_t parseUtf8(const char*& c, const char* end); |
|||
|
|||
// converts 'c' into a UTF-8 sequence and adds it to 'ret'
|
|||
void toUtf8(const char32_t c, std::string& ret); |
|||
|
|||
// throws illegal_utf8 exception if s is not valid UTF-8
|
|||
void assert_utf8(sv s); |
|||
|
|||
// creates an NFD u32string from UTF-8 input string s
|
|||
std::u32string fromUtf8_decompose(sv s); |
|||
|
|||
// convert NFD to NFC
|
|||
std::u32string createNFC(std::u32string nfd_string); |
|||
|
|||
// return No or Maybe, if at least one character with NFC_Quickcheck class is "No" or "Maybe"
|
|||
// might throw illegal_utf8 exception
|
|||
IsNFC isNFC_quick_check(sv s); |
|||
|
|||
// runs first quick check and a deep test if quick check returns "Maybe".
|
|||
bool isNFC(sv s); |
|||
|
|||
// returns true if the sequence is valid UTF-8
|
|||
bool isUtf8(const char* begin, const char* end); |
|||
|
|||
// converts a C++ string (in UTF-8) into NFC form
|
|||
// s is ''moved'' to the return value if possible so no copy is done here.
|
|||
std::string toNFC(sv s); |
|||
|
|||
#endif // JSON_ADAPTER_NFC_HH
|
File diff suppressed because it is too large
@ -0,0 +1,28 @@ |
|||
#ifndef NFC_SETS_HH |
|||
#define NFC_SETS_HH |
|||
|
|||
#include <set> |
|||
#include <map> |
|||
|
|||
// These data structures are filled by code generated automatically
|
|||
// from Unicode's DerivedNormalizationProps.txt and UnicodeData.txt.
|
|||
// see scripts/ subdirectory
|
|||
|
|||
// TODO: (maybe) Replace them by flat_map or sorted arrays, because these might be faster. But make benchmarks first!
|
|||
|
|||
// Contains all codepoints with NFC_No property.
|
|||
extern const std::set<unsigned> NFC_No; |
|||
|
|||
// Contains all codepoints with NFC_Maybe property.
|
|||
extern const std::set<unsigned> NFC_Maybe; |
|||
|
|||
// Contains CanonicalCombiningClass for given codepoints. All others have value 0.
|
|||
extern const std::map<unsigned, unsigned char> NFC_CombiningClass; |
|||
|
|||
// Contains the canonical decomposing pairs. second member might be -1 for single decompositions.
|
|||
extern const std::map<unsigned, std::pair<int,int>> NFC_Decompose; |
|||
|
|||
// canonical composing mapping, except excluded ones according to Unicode TR-15
|
|||
extern const std::map< std::pair<unsigned, unsigned>, unsigned> NFC_Compose; |
|||
|
|||
#endif // NFC_SETS_HH
|
Loading…
Reference in new issue