From 1a9c3fc1a8df94a3b0dde078747fbd4f9ed14840 Mon Sep 17 00:00:00 2001 From: roker Date: Fri, 2 Jul 2021 15:14:45 +0200 Subject: [PATCH] implement parseUtf16(). --- src/nfc.cc | 88 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/src/nfc.cc b/src/nfc.cc index 095d6b6..f3d7c0a 100644 --- a/src/nfc.cc +++ b/src/nfc.cc @@ -29,66 +29,93 @@ namespace return buf; } + // hex string of a 16-bit value + std::string hex16(char16_t u) + { + char buf[16] = {0}; + snprintf(buf, 15, "0x%04X", u); + return buf; + } + - class utf8_exception + class utf_exception { public: - utf8_exception(uint8_t u) : octet(u) {} - virtual ~utf8_exception() = default; + utf_exception(uint16_t u) : octet(u), value(u) {} + virtual ~utf_exception() = default; virtual std::string reason() const = 0; uint8_t octet; + uint16_t value; }; - class cont_without_start : public utf8_exception + class cont_without_start : public utf_exception { public: - cont_without_start(uint8_t u) : utf8_exception(u) {} + cont_without_start(uint8_t u) : utf_exception(u) {} std::string reason() const override { return "Continuation octet " + o2h(octet) + " without start octet"; } }; - class overlong_sequence : public utf8_exception + class overlong_sequence : public utf_exception { public: - overlong_sequence(uint8_t octet, unsigned u) : utf8_exception(octet), unicode(u) {} + overlong_sequence(uint8_t octet, unsigned u) : utf_exception(octet), unicode(u) {} std::string reason() const override { return "Overlong sequence for " + u2h(unicode); } unsigned unicode; }; - class unexpected_end : public utf8_exception + class unexpected_end : public utf_exception { public: - unexpected_end(uint8_t u) : utf8_exception(u) {} + unexpected_end(uint8_t u) : utf_exception(u) {} std::string reason() const override { return "Unexpected end of string"; } }; - class surrogate : public utf8_exception + class surrogate : public utf_exception { public: - surrogate(uint8_t u, 
unsigned s) : utf8_exception(u), surr(s) {} + surrogate(uint8_t u, unsigned s) : utf_exception(u), surr(s) {} std::string reason() const override { return "UTF-8-encoded UTF-16 surrogate " + u2h(surr) + " detected"; } private: unsigned surr; }; - class no_unicode : public utf8_exception + class no_unicode : public utf_exception { public: - explicit no_unicode(uint8_t _octet) : utf8_exception(_octet) {} + explicit no_unicode(uint8_t _octet) : utf_exception(_octet) {} std::string reason() const override { return "Octet " + o2h(octet) + " is illegal in UTF-8"; } }; - class too_big : public utf8_exception + class too_big : public utf_exception { public: - explicit too_big(uint8_t _octet, unsigned u) : utf8_exception(_octet), unicode(u) {} + explicit too_big(uint8_t _octet, unsigned u) : utf_exception(_octet), unicode(u) {} std::string reason() const override { return "Value " + u2h(unicode) + " is too big for Unicode"; } unsigned unicode; }; + class unexpected_low_surrogate : public utf_exception + { + public: + explicit unexpected_low_surrogate(char16_t c) : utf_exception(c) {} + std::string reason() const override { return "Unexpected low surogate " + hex16(value); } + }; + + + class missing_low_surrogate : public utf_exception + { + public: + explicit missing_low_surrogate(char16_t c, char16_t _surr) : utf_exception(c), surr(_surr) {} + std::string reason() const override { return "Non-low surrogate value " + hex16(value) + " is unexpected after high surogate " + hex16(surr); } + private: + char16_t surr; + }; + + std::string escape(pEp::string_view s) { std::string ret; ret.reserve(s.size() + 16 ); @@ -274,6 +301,33 @@ uint32_t parseUtf8(const char*& c, const char* end) } +uint32_t parseUtf16(const char16_t*& c, const char16_t* end) +{ + while(c<end) + { + const uint16_t u = *c; + if(u<0xD800 || u>=0xE000) + { + return u; + }else{ + if(u>=0xDC00) + { + throw unexpected_low_surrogate(u); + } + ++c; + if(c==end) throw unexpected_end(u); + const uint16_t low = *c; + if(low < 0xDC00 || low > 0xDFFF) + { + throw 
missing_low_surrogate(low, u); + } + return (u-0xD800) * 1024 + (low-0xDC00) + 0x10000; + } + } + throw unexpected_end(-1); +} + + void toUtf8(const char32_t c, std::string& ret) { if(c<=0x7F) @@ -333,7 +387,7 @@ void assert_utf8(string_view s) ++begin; } } - catch(const utf8_exception& e) + catch(const utf_exception& e) { throw illegal_utf8(s, begin - s.data(), e.reason()); } @@ -446,7 +500,7 @@ IsNFC isNFC_quick_check(string_view s) last_cc = cc; } } - catch(const utf8_exception& e) + catch(const utf_exception& e) { throw illegal_utf8(s, begin - s.data(), e.reason()); }