From a62e6f84e3b53281fed8c01b2fa3e4cb838fb252 Mon Sep 17 00:00:00 2001 From: Leonardo de Moura Date: Sat, 2 Aug 2014 00:47:17 -0700 Subject: [PATCH] feat(frontends/lean/scanner): allow letter-like unicode characters and sub/super-scripts in identifiers Signed-off-by: Leonardo de Moura --- src/frontends/lean/scanner.cpp | 56 ++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/src/frontends/lean/scanner.cpp b/src/frontends/lean/scanner.cpp index bca126576..54cbdf895 100644 --- a/src/frontends/lean/scanner.cpp +++ b/src/frontends/lean/scanner.cpp @@ -32,6 +32,52 @@ unsigned scanner::get_utf8_size(unsigned char c) { throw_exception("invalid utf-8 head character"); } +unsigned char to_uchar(char c) { return static_cast(c); } + +unsigned utf8_to_unicode(char const * begin, char const * end) { + unsigned result = 0; + if (begin == end) + return result; + auto it = begin; + unsigned c = to_uchar(*it); + ++it; + if (c < 128) + return c; + unsigned mask = (1u << 6) -1; + unsigned hmask = mask; + unsigned shift = 0; + unsigned num_bits = 0; + while ((c & 0xC0) == 0xC0) { + c <<= 1; + c &= 0xff; + num_bits += 6; + hmask >>= 1; + shift++; + result <<= 6; + if (it == end) + return 0; + result |= *it & mask; + ++it; + } + result |= ((c >> shift) & hmask) << num_bits; + return result; +} + +bool is_greek_unicode(unsigned u) { return 0x391 <= u && u <= 0x3DD; } +bool is_letter_like_unicode(unsigned u) { + return + (0x3b1 <= u && u <= 0x3c9 && u != 0x3bb) || // Lower greek, but lambda + (0x3ca <= u && u <= 0x3fb) || // Coptic letters + (0x1f00 <= u && u <= 0x1ffe) || // Polytonic Greek Extended Character Set + (0x2100 <= u && u <= 0x214f); // Letter like block +} +bool is_super_sub_script_alnum_unicode(unsigned u) { + return + (0x2070 <= u && u <= 0x2079) || + (0x207f <= u && u <= 0x2089) || + (0x2090 <= u && u <= 0x209c); +} + void scanner::set_line(unsigned p) { m_line = p; m_sline = p; @@ -279,11 +325,17 @@ void scanner::next_utf(buffer & cs) { } static bool is_id_first(buffer const & cs, unsigned i) { - return std::isalpha(cs[i]) || cs[i] == '_'; + if (std::isalpha(cs[i]) || cs[i] == '_') + return true; + unsigned u = utf8_to_unicode(cs.begin() + i, cs.end()); + return is_letter_like_unicode(u); } static bool is_id_rest(buffer const & cs, unsigned i) { - return std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\''; + if (std::isalnum(cs[i]) || cs[i] == '_' || cs[i] == '\'') + return true; + unsigned u = utf8_to_unicode(cs.begin() + i, cs.end()); + return is_letter_like_unicode(u) || is_super_sub_script_alnum_unicode(u); } auto scanner::read_key_cmd_id() -> token_kind {